xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision feff33ee0b5b037fa8f9f294dede656a2f85cc47)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* option names for MatCUSPARSEStorageFormat; the trailing entries are the enum type name, the value prefix, and the NULL terminator that PetscOptionsEnum() expects */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/*
   MatCUSPARSESetStream - caches the given CUDA stream in the matrix's GPU
   data structure and installs it on the matrix's cusparse handle so that
   subsequent cusparse calls for this matrix run on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
51 
/*
   MatCUSPARSESetHandle - replaces the cusparse handle stored in the matrix's
   GPU data structure with the given one (destroying any previous, different
   handle) and switches the handle to device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the old handle before adopting the caller's */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUDA(stat);}
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
67 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cusparse handle
   stored in the matrix's GPU data structure; used when the handle is owned
   elsewhere.  A matrix whose GPU data structure has not been allocated yet
   is a no-op rather than a crash.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard: spptr may be NULL if the GPU structure was never created/already destroyed */
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
76 
/*
   MatFactorGetSolverType_seqaij_cusparse - reports MATSOLVERCUSPARSE as the
   solver package associated with this factored matrix.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
88   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an empty SEQAIJCUSPARSE
   factor matrix B for A and wires up the symbolic-factorization entry points
   appropriate for the requested factor type (LU/ILU/ILUDT or Cholesky/ICC).
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* storage is filled in later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(); records the requested GPU storage format.  Both
   supported operations store into the same format field for a sequential
   matrix.  Before CUDA 4.2 only CSR is available.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
143 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:
   None - the storage format is recorded in the matrix itself

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatches to the type-specific implementation (a no-op for types that do not provide one) */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
170 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the options database for a
   SEQAIJCUSPARSE matrix.

   Supported options:
+  -mat_cusparse_mult_storage_format - GPU storage format for SpMV only
     (offered only for non-factored matrices)
-  -mat_cusparse_storage_format - GPU storage format for both SpMV and TriSolve
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* NOTE(review): PetscOptionsHead() is conventionally paired with PetscOptionsTail(),
     and PetscObjectOptionsBegin() with PetscOptionsEnd(); the mixed pairing here looks
     suspicious -- confirm the intent.  The missing error check on
     PetscObjectOptionsBegin() has been added. */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
197 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU factorization; the work
   is done on the CPU by the AIJ implementation, after which the numeric phase
   is redirected to the CUSPARSE version (which copies the factors to the GPU).
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
207 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU factorization; delegates
   to the CPU AIJ implementation and installs the CUSPARSE numeric kernel.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
217 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC factorization; delegates
   to the CPU AIJ implementation and installs the CUSPARSE numeric kernel.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
227 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky factorization;
   delegates to the CPU AIJ implementation and installs the CUSPARSE numeric
   kernel.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
237 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - assembles, in pinned host memory,
   a CSR copy of the lower triangular factor L (with explicit unit diagonal
   entries appended to each row), uploads it to the GPU via thrust, and runs
   the cusparse triangular-solve analysis.  Work is done only when the GPU
   copy is absent or out of date with respect to the CPU data.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;   /* host CSR row offsets / column indices of L */
  PetscScalar                       *AALo;          /* host CSR values of L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 holds only its diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix; pinned host memory speeds the later device uploads */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: lower triangular, zero-based, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign() calls below copy the host arrays to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* store the new factor struct back into the matrix; the local loTriFactor
         was only a copy of the (possibly NULL) previous pointer */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): this only catches thrown char* messages; thrust/cusparse
         wrappers that throw std::exception subclasses would not be caught here -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - assembles, in pinned host memory,
   a CSR copy of the upper triangular factor U (diagonal stored inverted, as
   in the AIJ factor layout), uploads it to the GPU via thrust, and runs the
   cusparse triangular-solve analysis.  Work is done only when the GPU copy is
   absent or out of date.  Rows are processed from last to first because the
   AIJ factor stores U via a->diag offsets that run backwards.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;   /* host CSR row offsets / column indices of U */
  PetscScalar                       *AAUp;          /* host CSR values of U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix; pinned host memory speeds the later device uploads */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from the bottom up */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: the AIJ factor stores the inverted
           pivot at v[nz], so 1./v[nz] recovers the diagonal value of U */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, zero-based, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign() calls below copy the host arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* store the new factor struct back into the matrix; the local upTriFactor
         was only a copy of the (possibly NULL) previous pointer */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): catches thrown char* only; std::exception subclasses would escape -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
437 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - copies both ILU triangular
   factors of A to the GPU, allocates the device work vector used by the
   solves, and uploads the row/column permutations when they are not the
   identity (identity permutations are left NULL so the permutation-free
   solve path can be used).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = aij->row,iscol = aij->icol;
  PetscBool                    rowIdentity,colIdentity;
  const PetscInt               *indices;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector for the two-stage triangular solve */
  triFactors->workVector = new THRUSTARRAY;
  triFactors->workVector->resize(n);
  triFactors->nnz = aij->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;

  /* row permutation (applied around the lower triangular solve) */
  ierr = ISGetIndices(isrow,&indices);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity) {
    triFactors->rpermIndices = new THRUSTINTARRAY(n);
    triFactors->rpermIndices->assign(indices, indices+n);
  }
  ierr = ISRestoreIndices(isrow,&indices);CHKERRQ(ierr);

  /* column permutation (applied around the upper triangular solve) */
  ierr = ISGetIndices(iscol,&indices);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity) {
    triFactors->cpermIndices = new THRUSTINTARRAY(n);
    triFactors->cpermIndices->assign(indices, indices+n);
  }
  ierr = ISRestoreIndices(iscol,&indices);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
476 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds, in pinned host memory, the
   CSR arrays for both triangular factors of an ICC/Cholesky factorization
   and uploads them to the GPU.  Only one sparsity pattern (the upper
   triangle) is stored; the "lower" factor reuses it and is solved with the
   transpose operation.  Work is done only when the GPU copy is absent or
   out of date.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;   /* shared host CSR row offsets / column indices */
  PetscScalar                       *AAUp;          /* host values of the upper factor */
  PetscScalar                       *AALo;          /* host values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and to
     Mat_SeqSBAIJ (for i/j/a); this relies on the leading members of the two
     structs lining up -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix; pinned host memory speeds the device uploads */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers; in this (SBAIJ-style) layout each row stores its
           off-diagonal entries first and the diagonal entry last, at v[nz] */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (stored inverted in both factors) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* copy and rescale the off-diagonal entries of row i */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, zero-based, unit diagonal
         (the explicitly stored diagonal entries are therefore not referenced by the solve) */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign() calls copy the host arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* store the new factor struct back into the matrix; the local upTriFactor
         was only a copy of the (possibly NULL) previous pointer */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the same upper-triangular pattern is
         stored, but solved with the TRANSPOSE operation below, which yields
         the lower triangular solve; diagonal is non-unit here */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* store the new factor struct back into the matrix; the local loTriFactor
         was only a copy of the (possibly NULL) previous pointer */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): catches thrown char* only; std::exception subclasses would escape -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
621 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - copies the ICC/Cholesky factors
   of A to the GPU, allocates the device work vector, and uploads the (single)
   permutation as both the row and column permutation when it is not the
   identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = aij->row;
  const PetscInt               *perm;
  PetscBool                    identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* scratch vector for the two-stage triangular solve */
  triFactors->workVector = new THRUSTARRAY;
  triFactors->workVector->resize(n);
  /* off-diagonal entries are counted once per factor, the diagonal only once */
  triFactors->nnz = (aij->nz-n)*2 + n;

  /* one permutation serves as both the row and the column permutation */
  ierr = ISGetIndices(ip,&perm);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&identity);CHKERRQ(ierr);
  if (!identity) {
    triFactors->rpermIndices = new THRUSTINTARRAY(n);
    triFactors->rpermIndices->assign(perm, perm+n);
    triFactors->cpermIndices = new THRUSTINTARRAY(n);
    triFactors->cpermIndices->assign(perm, perm+n);
  }
  ierr = ISRestoreIndices(ip,&perm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
650 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - performs the numeric LU factorization
   on the CPU via the AIJ kernel, selects the solve routines according to
   whether both orderings are the natural ordering, then copies the factors
   to the GPU and runs the cusparse solve analysis.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowNatural,colNatural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric factorization happens on the host */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used: natural orderings
     allow the permutation-free kernels */
  ierr = ISIdentity(fact->row,&rowNatural);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colNatural);CHKERRQ(ierr);
  if (rowNatural && colNatural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
675 
/* Numeric Cholesky factorization: delegate to the sequential AIJ kernel,
   install the matching CUSPARSE solve routines (natural ordering skips the
   permutation steps), then copy the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* only the row ordering matters here; rows and columns share one IS */
  ierr = ISIdentity(b->row,&identity);CHKERRQ(ierr);
  if (!identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* move the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
700 
/* Build, on the GPU, transposed (CSC-form) copies of the lower and upper
   triangular factors and run the cusparse solve analysis on them, so that
   MatSolveTranspose can be carried out as two NON_TRANSPOSE triangular solves.
   Results are stored in cusparseTriFactors->loTriFactorPtrTranspose and
   ->upTriFactorPtrTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  /* NOTE(review): the initializers on the two declarations below are dead
     stores -- both pointers are unconditionally overwritten by `new` before
     they are ever read in this function */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the lower triangular factor,
     flipping the fill mode since the transpose of a lower triangular
     matrix is upper triangular (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the factor is stored explicitly transposed, so the solve op is NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC:
     csr2csc writes the CSC arrays, which we reinterpret as the CSR of the
     transpose (the factor is square, so the sizes above are symmetric) */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the upper triangular factor, again with
     the fill mode flipped for the transpose */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the factor is stored explicitly transposed, so the solve op is NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
831 
/* Build an explicit transpose of A on the GPU so that MatMultTranspose() can be
   executed as a plain (NON_TRANSPOSE) multiply.  The result is stored in
   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.  For ELL/HYB storage the
   transpose is obtained by converting HYB->CSR->CSC->HYB (requires CUDA 5.0+). */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* no initializer: it was a dead store, overwritten below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta as device-resident scalars (device pointer mode below) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;   /* the transpose of an m x n matrix is n x m */
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* BUGFIX: the transpose has A->cmap->n rows, so its row_offsets array needs
       A->cmap->n+1 entries (cusparse_csr2csc writes num_cols+1 column pointers).
       The previous size of A->rmap->n+1 overflowed for rectangular matrices with
       more columns than rows. */
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of A, i.e. its CSC, stored as the CSR of A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR (temp has the dimensions of A itself) */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose).
       BUGFIX: tempT is the transpose of temp, so its dimensions are swapped and
       its row_offsets array needs A->cmap->n+1 entries; the old code used the
       untransposed dimensions, overflowing for rectangular matrices. */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB, using the transposed dimensions */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (the transpose has A->cmap->n rows) */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
960 
/* Solve A^T x = b using the transposed triangular factors, applying the stored
   row permutation to b first and the column permutation to x afterwards.
   The transposed factors are built lazily on first use.
   NOTE(review): rpermIndices/cpermIndices are dereferenced unconditionally here;
   this routine is only installed when the ordering is not the identity (see the
   dispatch in MatLUFactorNumeric_SeqAIJCUSPARSE), in which case they exist. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x <- b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (x -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the transposed lower factor (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and wait for the async GPU work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1028 
/* Solve A^T x = b when the factors are in natural ordering: no permutation of
   b or x is needed, so the two transposed triangular solves are applied
   directly.  The transposed factors are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors on the fly the first time through */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* apply the transposed upper factor: b -> work */
  stat = cusparse_solve(factors->handle, upT->solveOp,
                        upT->csrMat->num_rows, &ALPHA, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        barray, work->data().get());CHKERRCUDA(stat);

  /* apply the transposed lower factor: work -> x */
  stat = cusparse_solve(factors->handle, loT->solveOp,
                        loT->csrMat->num_rows, &ALPHA, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);

  /* release the vector arrays and wait for the asynchronous work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1077 
/* Solve A x = b with the GPU triangular factors, applying the stored row
   permutation to b first and the column permutation to x afterwards.
   NOTE(review): rpermIndices/cpermIndices are dereferenced unconditionally here;
   this routine is only installed when the ordering is not the identity (see the
   dispatch in MatLUFactorNumeric_SeqAIJCUSPARSE), in which case they exist. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x <- b(rperm).
     NOTE(review): the end iterator reuses base bGPU rather than the bGPU+n form
     used in MatSolveTranspose_SeqAIJCUSPARSE; presumably equivalent because a
     permutation_iterator's extent is governed by its index iterator -- confirm
     against the thrust documentation. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L (x -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and wait for the async GPU work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1136 
/* Solve A x = b when the factors are in natural ordering: no permutation of
   b or x is needed, so the forward (L) and backward (U) triangular solves are
   applied directly to the vector arrays. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward solve with L: b -> work */
  stat = cusparse_solve(factors->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        barray, work->data().get());CHKERRCUDA(stat);

  /* backward solve with U: work -> x */
  stat = cusparse_solve(factors->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);

  /* release the vector arrays and wait for the asynchronous work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1177 
1178 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1179 {
1180 
1181   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1182   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1183   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1184   PetscInt                     m = A->rmap->n,*ii,*ridx;
1185   PetscErrorCode               ierr;
1186   cusparseStatus_t             stat;
1187   cudaError_t                  err;
1188 
1189   PetscFunctionBegin;
1190   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1191     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1192     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1193       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1194       /* copy values only */
1195       matrix->values->assign(a->a, a->a+a->nz);
1196     } else {
1197       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1198       try {
1199         cusparsestruct->nonzerorow=0;
1200         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1201 
1202         if (a->compressedrow.use) {
1203           m    = a->compressedrow.nrows;
1204           ii   = a->compressedrow.i;
1205           ridx = a->compressedrow.rindex;
1206         } else {
1207           /* Forcing compressed row on the GPU */
1208           int k=0;
1209           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1210           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1211           ii[0]=0;
1212           for (int j = 0; j<m; j++) {
1213             if ((a->i[j+1]-a->i[j])>0) {
1214               ii[k]  = a->i[j];
1215               ridx[k]= j;
1216               k++;
1217             }
1218           }
1219           ii[cusparsestruct->nonzerorow] = a->nz;
1220           m = cusparsestruct->nonzerorow;
1221         }
1222 
1223         /* allocate space for the triangular factor information */
1224         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1225         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1226         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1227         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1228 
1229         err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
1230         err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1231         err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
1232         err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1233         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1234 
1235         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1236         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1237           /* set the matrix */
1238           CsrMatrix *matrix= new CsrMatrix;
1239           matrix->num_rows = m;
1240           matrix->num_cols = A->cmap->n;
1241           matrix->num_entries = a->nz;
1242           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1243           matrix->row_offsets->assign(ii, ii + m+1);
1244 
1245           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1246           matrix->column_indices->assign(a->j, a->j+a->nz);
1247 
1248           matrix->values = new THRUSTARRAY(a->nz);
1249           matrix->values->assign(a->a, a->a+a->nz);
1250 
1251           /* assign the pointer */
1252           matstruct->mat = matrix;
1253 
1254         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1255 #if CUDA_VERSION>=4020
1256           CsrMatrix *matrix= new CsrMatrix;
1257           matrix->num_rows = m;
1258           matrix->num_cols = A->cmap->n;
1259           matrix->num_entries = a->nz;
1260           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1261           matrix->row_offsets->assign(ii, ii + m+1);
1262 
1263           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1264           matrix->column_indices->assign(a->j, a->j+a->nz);
1265 
1266           matrix->values = new THRUSTARRAY(a->nz);
1267           matrix->values->assign(a->a, a->a+a->nz);
1268 
1269           cusparseHybMat_t hybMat;
1270           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1271           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1272             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1273           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1274               matstruct->descr, matrix->values->data().get(),
1275               matrix->row_offsets->data().get(),
1276               matrix->column_indices->data().get(),
1277               hybMat, 0, partition);CHKERRCUDA(stat);
1278           /* assign the pointer */
1279           matstruct->mat = hybMat;
1280 
1281           if (matrix) {
1282             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1283             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1284             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1285             delete (CsrMatrix*)matrix;
1286           }
1287 #endif
1288         }
1289 
1290         /* assign the compressed row indices */
1291         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1292         matstruct->cprowIndices->assign(ridx,ridx+m);
1293 
1294         /* assign the pointer */
1295         cusparsestruct->mat = matstruct;
1296 
1297         if (!a->compressedrow.use) {
1298           ierr = PetscFree(ii);CHKERRQ(ierr);
1299           ierr = PetscFree(ridx);CHKERRQ(ierr);
1300         }
1301         cusparsestruct->workVector = new THRUSTARRAY;
1302         cusparsestruct->workVector->resize(m);
1303       } catch(char *ex) {
1304         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1305       }
1306       cusparsestruct->nonzerostate = A->nonzerostate;
1307     }
1308     ierr = WaitForGPU();CHKERRCUDA(ierr);
1309     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1310     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1311   }
1312   PetscFunctionReturn(0);
1313 }
1314 
/*
   Creates CUDA vectors conformal with the matrix: the right vector is sized by
   the column layout (an x in y = A*x) and the left vector by the row layout
   (a y).  Both are created as VECSEQCUDA so matrix-vector products stay on the GPU.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix column layout with the new vector (reference counted) */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix row layout with the new vector (reference counted) */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1338 
/* Functor applied by thrust::for_each over a zip iterator: accumulates the
   first tuple element into the second, i.e. dest += src.  Used to scatter-add
   the compressed work vector into a full-length result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1348 
/*
   y = A*x on the GPU via cusparse (CSR or ELL/HYB storage).

   NOTE(review): in the CSR branch mat->num_rows is the number of *compressed*
   rows when row compression is in effect, yet yarray is written densely from
   index 0 with no scatter through cprowIndices -- presumably this path is only
   correct when the rows are not compressed; confirm against
   MatSeqAIJCUSPARSECopyToGPU and MatMultAdd_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* zero y first; the spmv may write fewer than the full number of entries */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* alpha/beta are device pointers (pointer mode DEVICE was set at copy-to-GPU time) */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB share the cusparse hybrid-matrix code path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* 2 flops (mul+add) per nonzero, minus one add per nonzero row */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1390 
/*
   y = A^T * x on the GPU.  An explicit transpose of A is built lazily on
   first use (MatSeqAIJCUSPARSEGenerateTransposeForMult) and then multiplied
   with a NON_TRANSPOSE spmv, which is faster than cusparse's transposed spmv.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose product: build the explicit transpose on the GPU */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* zero y first; the spmv may write fewer than the full number of entries */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    /* NON_TRANSPOSE on the stored transpose gives A^T * x */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1438 
1439 
/*
   z = y + A*x on the GPU.

   The product is formed into the compressed work vector (one entry per
   nonzero row of A) and then scattered into z with a += through the
   compressed-row index list cprowIndices, so rows of A that are entirely
   zero leave the corresponding entries of z (already copied from y) intact.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* z starts as a copy of y; the product is added below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector.
         NOTE(review): unlike the HYB branch below, this branch is not guarded
         against an empty workVector -- presumably cusparse accepts num_rows==0;
         confirm */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray, matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1499 
/*
   z = y + A^T * x on the GPU.

   The (lazily built) explicit transpose of A is multiplied with x into the
   compressed work vector, and the result is scattered into z with a +=
   through the transpose's compressed-row index list cprowIndices.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose product: build the explicit transpose on the GPU */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* z starts as a copy of y; the product is added below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation.
       Bug fix: the scatter length must be the length of the transpose's
       compressed-row index list, not A->cmap->n; the old bound advanced the
       zip iterator past the end of cprowIndices (and workVector) whenever the
       transpose is stored compressed.  For a square, non-compressed matrix the
       two bounds coincide.  NOTE(review): workVector is assumed to hold at
       least cprowIndices->size() entries -- confirm sizing in CopyToGPU. */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + matstructT->cprowIndices->size(),
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1565 
/*
   Completes assembly: finishes the host-side SeqAIJ assembly, pushes the data
   to the GPU (non-factored matrices only), and on a final assembly installs
   the GPU multiply kernels into the operations table.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: route the multiply operations to the GPU implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1582 
1583 /* --------------------------------------------------------------------------------*/
1584 /*@
1585    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1587    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1588    assembly performance the user should preallocate the matrix storage by setting
1589    the parameter nz (or the array nnz).  By setting these parameters accurately,
1590    performance during matrix assembly can be increased by more than a factor of 50.
1591 
1592    Collective on MPI_Comm
1593 
1594    Input Parameters:
1595 +  comm - MPI communicator, set to PETSC_COMM_SELF
1596 .  m - number of rows
1597 .  n - number of columns
1598 .  nz - number of nonzeros per row (same for all rows)
1599 -  nnz - array containing the number of nonzeros in the various rows
1600          (possibly different for each row) or NULL
1601 
1602    Output Parameter:
1603 .  A - the matrix
1604 
1605    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1607    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1608 
1609    Notes:
1610    If nnz is given then nz is ignored
1611 
1612    The AIJ format (also called the Yale sparse matrix format or
1613    compressed row storage), is fully compatible with standard Fortran 77
1614    storage.  That is, the stored row and column indices can begin at
1615    either one (as in Fortran) or zero.  See the users' manual for details.
1616 
1617    Specify the preallocated storage with either nz or nnz (not both).
1618    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1619    allocation.  For large problems you MUST preallocate memory or you
1620    will get TERRIBLE performance, see the users' manual chapter on matrices.
1621 
1622    By default, this format uses inodes (identical nodes) when possible, to
1623    improve numerical efficiency of matrix-vector products and solves. We
1624    search for consecutive rows with the same nonzero structure, thereby
1625    reusing matrix information to achieve increased efficiency.
1626 
1627    Level: intermediate
1628 
1629 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1630 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate via the SeqAIJ routine; if nnz is given it overrides nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1642 
/*
   Destroys the matrix: first releases whatever GPU data hangs off spptr
   (multiply structures for plain matrices, triangular-factor data for
   factored ones), then hands off to the host-side SeqAIJ destroy.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices carry triangular-factor data in spptr */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* only tear down the GPU copy if one was ever allocated */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1658 
/*
   Duplicates a SeqAIJCUSPARSE matrix.  The host-side AIJ data is duplicated by
   MatDuplicate_SeqAIJ(); here we attach a fresh, empty GPU data structure to
   the copy and re-install the CUSPARSE operations table.  The GPU copy itself
   is built lazily at the next MatSeqAIJCUSPARSECopyToGPU().
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created;
       attach an empty GPU multiply structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    C->spptr = spptr;

    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    /* NOTE(review): the storage format of A is not inherited by the duplicate -- confirm intended */
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    spptr->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* factored matrix: attach empty triangular-factor containers */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr = spptr;

    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    spptr->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle                  = handle;
    spptr->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1717 
/*
   Type constructor for MATSEQAIJCUSPARSE: builds the host-side SeqAIJ matrix,
   attaches an empty GPU data structure (multiply structures for plain
   matrices, triangular-factor containers for factored ones), and installs the
   CUSPARSE operations table.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created;
       attach an empty GPU multiply structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    B->spptr = spptr;

    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    spptr->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* factored matrix: attach empty triangular-factor containers */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr = spptr;

    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    spptr->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle                  = handle;
    spptr->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1773 
1774 /*MC
1775    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1776 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1778    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1779    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1780 
1781    Options Database Keys:
1782 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1783 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1785 
1786   Level: beginner
1787 
1788 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1789 M*/
1790 
1791 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1792 
1793 
/* Registers the CUSPARSE-based factorizations (LU, Cholesky, ILU, ICC) for
   MATSEQAIJCUSPARSE matrices with PETSc's solver-package registry, all backed
   by the same factory routine MatGetFactor_seqaijcusparse_cusparse(). */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1805 
1806 
/*
   Frees all GPU resources held by a Mat_SeqAIJCUSPARSE structure and zeros
   the caller's pointer.  Safe to call when *cusparsestruct is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* destroy both the forward and the (lazily built) transpose multiply
       structures; check the returns instead of silently discarding them */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    /* explicit assignment rather than the old assignment-inside-if */
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1825 
/*
   Frees the three device-side Thrust arrays of a CsrMatrix and the container
   itself, then zeros the caller's pointer.  Safe to call when *mat is NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;
    /* deleting a null member pointer is a no-op, so no per-member guards needed */
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1838 
/*
   Frees a triangular-factor structure: the cusparse matrix descriptor, the
   solve-analysis info, and the CSR storage; zeros the caller's pointer.
   Safe to call when *trifactor is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1854 
/*
   Frees a multiply structure: the stored matrix (hybrid or CSR depending on
   the storage format), the descriptor, the compressed-row index array, and
   the device-resident alpha/beta scalars; zeros the caller's pointer.
   Safe to call when *matstruct is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB are both stored as a cusparse hybrid matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* check the return (was silently discarded), consistent with the
           trifactor overload above */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha and beta live in device memory (see CopyToGPU) */
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1881 
/*
   Frees all triangular-factor data: the four factor structures (lower/upper
   and their transposes), the permutation index arrays, the work vector and
   the cusparse handle; zeros the caller's pointer.  Safe to call when
   *trifactors is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check each destroy return (was silently discarded) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* explicit assignment rather than the old assignment-inside-if */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1904 
1905