xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision c1fb3f03172742ca6300f9be84ed76fc7deccfba)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
41 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
42 {
43   cusparseStatus_t   stat;
44   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
45 
46   PetscFunctionBegin;
47   cusparsestruct->stream = stream;
48   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
49   PetscFunctionReturn(0);
50 }
51 
52 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
53 {
54   cusparseStatus_t   stat;
55   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
56 
57   PetscFunctionBegin;
58   if (cusparsestruct->handle != handle) {
59     if (cusparsestruct->handle) {
60       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
61     }
62     cusparsestruct->handle = handle;
63   }
64   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
65   PetscFunctionReturn(0);
66 }
67 
68 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
69 {
70   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
71   PetscFunctionBegin;
72   if (cusparsestruct->handle)
73     cusparsestruct->handle = 0;
74   PetscFunctionReturn(0);
75 }
76 
77 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
78 {
79   PetscFunctionBegin;
80   *type = MATSOLVERCUSPARSE;
81   PetscFunctionReturn(0);
82 }
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
98 {
99   PetscErrorCode ierr;
100   PetscInt       n = A->rmap->n;
101 
102   PetscFunctionBegin;
103   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
104   (*B)->factortype = ftype;
105   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
106   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
107 
108   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
110     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
116 
117   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
118   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123 {
124   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
125 
126   PetscFunctionBegin;
127 #if CUDA_VERSION>=4020
128   switch (op) {
129   case MAT_CUSPARSE_MULT:
130     cusparsestruct->format = format;
131     break;
132   case MAT_CUSPARSE_ALL:
133     cusparsestruct->format = format;
134     break;
135   default:
136     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137   }
138 #else
139   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140 #endif
141   PetscFunctionReturn(0);
142 }
143 
144 /*@
145    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146    operation. Only the MatMult operation can use different GPU storage formats
147    for MPIAIJCUSPARSE matrices.
148    Not Collective
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
154 
155    Output Parameter:
156 
157    Level: intermediate
158 
159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160 @*/
161 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162 {
163   PetscErrorCode ierr;
164 
165   PetscFunctionBegin;
166   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
167   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
168   PetscFunctionReturn(0);
169 }
170 
171 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172 {
173   PetscErrorCode           ierr;
174   MatCUSPARSEStorageFormat format;
175   PetscBool                flg;
176   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
177 
178   PetscFunctionBegin;
179   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
180   if (A->factortype==MAT_FACTOR_NONE) {
181     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
182                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
183     if (flg) {
184       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
185     }
186   }
187   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
188                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
189   if (flg) {
190     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
191   }
192   ierr = PetscOptionsTail();CHKERRQ(ierr);
193   PetscFunctionReturn(0);
194 
195 }
196 
197 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
198 {
199   PetscErrorCode ierr;
200 
201   PetscFunctionBegin;
202   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
203   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
204   PetscFunctionReturn(0);
205 }
206 
207 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
208 {
209   PetscErrorCode ierr;
210 
211   PetscFunctionBegin;
212   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
213   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
214   PetscFunctionReturn(0);
215 }
216 
217 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
218 {
219   PetscErrorCode ierr;
220 
221   PetscFunctionBegin;
222   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
223   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
224   PetscFunctionReturn(0);
225 }
226 
227 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
228 {
229   PetscErrorCode ierr;
230 
231   PetscFunctionBegin;
232   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
233   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
234   PetscFunctionReturn(0);
235 }
236 
237 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
238 {
239   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
240   PetscInt                          n = A->rmap->n;
241   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
242   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
243   cusparseStatus_t                  stat;
244   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
245   const MatScalar                   *aa = a->a,*v;
246   PetscInt                          *AiLo, *AjLo;
247   PetscScalar                       *AALo;
248   PetscInt                          i,nz, nzLower, offset, rowOffset;
249   PetscErrorCode                    ierr;
250 
251   PetscFunctionBegin;
252   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
253     try {
254       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255       nzLower=n+ai[n]-ai[1];
256 
257       /* Allocate Space for the lower triangular matrix */
258       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
259       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
260       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
261 
262       /* Fill the lower triangular matrix */
263       AiLo[0]  = (PetscInt) 0;
264       AiLo[n]  = nzLower;
265       AjLo[0]  = (PetscInt) 0;
266       AALo[0]  = (MatScalar) 1.0;
267       v        = aa;
268       vi       = aj;
269       offset   = 1;
270       rowOffset= 1;
271       for (i=1; i<n; i++) {
272         nz = ai[i+1] - ai[i];
273         /* additional 1 for the term on the diagonal */
274         AiLo[i]    = rowOffset;
275         rowOffset += nz+1;
276 
277         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
278         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
279 
280         offset      += nz;
281         AjLo[offset] = (PetscInt) i;
282         AALo[offset] = (MatScalar) 1.0;
283         offset      += 1;
284 
285         v  += nz;
286         vi += nz;
287       }
288 
289       /* allocate space for the triangular factor information */
290       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
291 
292       /* Create the matrix description */
293       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
294       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
295       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
296       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
297       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
298 
299       /* Create the solve analysis information */
300       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
301 
302       /* set the operation */
303       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
304 
305       /* set the matrix */
306       loTriFactor->csrMat = new CsrMatrix;
307       loTriFactor->csrMat->num_rows = n;
308       loTriFactor->csrMat->num_cols = n;
309       loTriFactor->csrMat->num_entries = nzLower;
310 
311       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
312       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
313 
314       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
315       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
316 
317       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
318       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
319 
320       /* perform the solve analysis */
321       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
322                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
323                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
324                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
325 
326       /* assign the pointer. Is this really necessary? */
327       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
328 
329       ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
330       ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
331       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
332       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
333     } catch(char *ex) {
334       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335     }
336   }
337   PetscFunctionReturn(0);
338 }
339 
340 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341 {
342   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
343   PetscInt                          n = A->rmap->n;
344   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346   cusparseStatus_t                  stat;
347   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
348   const MatScalar                   *aa = a->a,*v;
349   PetscInt                          *AiUp, *AjUp;
350   PetscScalar                       *AAUp;
351   PetscInt                          i,nz, nzUpper, offset;
352   PetscErrorCode                    ierr;
353 
354   PetscFunctionBegin;
355   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
356     try {
357       /* next, figure out the number of nonzeros in the upper triangular matrix. */
358       nzUpper = adiag[0]-adiag[n];
359 
360       /* Allocate Space for the upper triangular matrix */
361       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
364 
365       /* Fill the upper triangular matrix */
366       AiUp[0]=(PetscInt) 0;
367       AiUp[n]=nzUpper;
368       offset = nzUpper;
369       for (i=n-1; i>=0; i--) {
370         v  = aa + adiag[i+1] + 1;
371         vi = aj + adiag[i+1] + 1;
372 
373         /* number of elements NOT on the diagonal */
374         nz = adiag[i] - adiag[i+1]-1;
375 
376         /* decrement the offset */
377         offset -= (nz+1);
378 
379         /* first, set the diagonal elements */
380         AjUp[offset] = (PetscInt) i;
381         AAUp[offset] = (MatScalar)1./v[nz];
382         AiUp[i]      = AiUp[i+1] - (nz+1);
383 
384         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
385         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
386       }
387 
388       /* allocate space for the triangular factor information */
389       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
390 
391       /* Create the matrix description */
392       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
397 
398       /* Create the solve analysis information */
399       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
400 
401       /* set the operation */
402       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
403 
404       /* set the matrix */
405       upTriFactor->csrMat = new CsrMatrix;
406       upTriFactor->csrMat->num_rows = n;
407       upTriFactor->csrMat->num_cols = n;
408       upTriFactor->csrMat->num_entries = nzUpper;
409 
410       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
412 
413       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
415 
416       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
418 
419       /* perform the solve analysis */
420       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
424 
425       /* assign the pointer. Is this really necessary? */
426       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
427 
428       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
432     } catch(char *ex) {
433       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
434     }
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
440 {
441   PetscErrorCode               ierr;
442   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
443   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
444   IS                           isrow = a->row,iscol = a->icol;
445   PetscBool                    row_identity,col_identity;
446   const PetscInt               *r,*c;
447   PetscInt                     n = A->rmap->n;
448 
449   PetscFunctionBegin;
450   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
451   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
452 
453   cusparseTriFactors->workVector = new THRUSTARRAY(n);
454   cusparseTriFactors->nnz=a->nz;
455 
456   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
457   /*lower triangular indices */
458   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
459   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
460   if (!row_identity) {
461     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462     cusparseTriFactors->rpermIndices->assign(r, r+n);
463   }
464   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
465 
466   /*upper triangular indices */
467   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
468   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
469   if (!col_identity) {
470     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471     cusparseTriFactors->cpermIndices->assign(c, c+n);
472   }
473 
474   if(!row_identity && !col_identity) {
475     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
476   } else if(!row_identity) {
477     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
478   } else if(!col_identity) {
479     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
480   }
481 
482   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
487 {
488   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
489   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
491   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
492   cusparseStatus_t                  stat;
493   PetscErrorCode                    ierr;
494   PetscInt                          *AiUp, *AjUp;
495   PetscScalar                       *AAUp;
496   PetscScalar                       *AALo;
497   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
498   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
499   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
500   const MatScalar                   *aa = b->a,*v;
501 
502   PetscFunctionBegin;
503   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
504     try {
505       /* Allocate Space for the upper triangular matrix */
506       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
507       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
508       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
509       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
510 
511       /* Fill the upper triangular matrix */
512       AiUp[0]=(PetscInt) 0;
513       AiUp[n]=nzUpper;
514       offset = 0;
515       for (i=0; i<n; i++) {
516         /* set the pointers */
517         v  = aa + ai[i];
518         vj = aj + ai[i];
519         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
520 
521         /* first, set the diagonal elements */
522         AjUp[offset] = (PetscInt) i;
523         AAUp[offset] = (MatScalar)1.0/v[nz];
524         AiUp[i]      = offset;
525         AALo[offset] = (MatScalar)1.0/v[nz];
526 
527         offset+=1;
528         if (nz>0) {
529           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
530           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
531           for (j=offset; j<offset+nz; j++) {
532             AAUp[j] = -AAUp[j];
533             AALo[j] = AAUp[j]/v[nz];
534           }
535           offset+=nz;
536         }
537       }
538 
539       /* allocate space for the triangular factor information */
540       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
541 
542       /* Create the matrix description */
543       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
544       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
545       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
546       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
547       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
548 
549       /* Create the solve analysis information */
550       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
551 
552       /* set the operation */
553       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
554 
555       /* set the matrix */
556       upTriFactor->csrMat = new CsrMatrix;
557       upTriFactor->csrMat->num_rows = A->rmap->n;
558       upTriFactor->csrMat->num_cols = A->cmap->n;
559       upTriFactor->csrMat->num_entries = a->nz;
560 
561       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
562       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
563 
564       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
565       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
566 
567       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
568       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
569 
570       /* perform the solve analysis */
571       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
572                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
573                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
574                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
575 
576       /* assign the pointer. Is this really necessary? */
577       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
578 
579       /* allocate space for the triangular factor information */
580       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
581 
582       /* Create the matrix description */
583       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
584       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
585       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
586       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
587       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
588 
589       /* Create the solve analysis information */
590       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
591 
592       /* set the operation */
593       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
594 
595       /* set the matrix */
596       loTriFactor->csrMat = new CsrMatrix;
597       loTriFactor->csrMat->num_rows = A->rmap->n;
598       loTriFactor->csrMat->num_cols = A->cmap->n;
599       loTriFactor->csrMat->num_entries = a->nz;
600 
601       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
602       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
603 
604       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
605       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
606 
607       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
608       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
609       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
610 
611       /* perform the solve analysis */
612       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
613                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
614                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
615                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
616 
617       /* assign the pointer. Is this really necessary? */
618       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
619 
620       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
621       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
622       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
623       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
624       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
625     } catch(char *ex) {
626       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
627     }
628   }
629   PetscFunctionReturn(0);
630 }
631 
632 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
633 {
634   PetscErrorCode               ierr;
635   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
636   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
637   IS                           ip = a->row;
638   const PetscInt               *rip;
639   PetscBool                    perm_identity;
640   PetscInt                     n = A->rmap->n;
641 
642   PetscFunctionBegin;
643   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
644   cusparseTriFactors->workVector = new THRUSTARRAY(n);
645   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
646 
647   /*lower triangular indices */
648   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
649   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
650   if (!perm_identity) {
651     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
652     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
653     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
654     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
655     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
656   }
657   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
658   PetscFunctionReturn(0);
659 }
660 
661 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
662 {
663   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
664   IS             isrow = b->row,iscol = b->col;
665   PetscBool      row_identity,col_identity;
666   PetscErrorCode ierr;
667 
668   PetscFunctionBegin;
669   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
670   /* determine which version of MatSolve needs to be used. */
671   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
672   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
673   if (row_identity && col_identity) {
674     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
675     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
676   } else {
677     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
678     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
679   }
680 
681   /* get the triangular factors */
682   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
683   PetscFunctionReturn(0);
684 }
685 
686 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
687 {
688   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
689   IS             ip = b->row;
690   PetscBool      perm_identity;
691   PetscErrorCode ierr;
692 
693   PetscFunctionBegin;
694   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
695 
696   /* determine which version of MatSolve needs to be used. */
697   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
698   if (perm_identity) {
699     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
700     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
701   } else {
702     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
703     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
704   }
705 
706   /* get the triangular factors */
707   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
708   PetscFunctionReturn(0);
709 }
710 
711 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
712 {
713   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
714   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
715   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
716   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
717   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
718   cusparseStatus_t                  stat;
719   cusparseIndexBase_t               indexBase;
720   cusparseMatrixType_t              matrixType;
721   cusparseFillMode_t                fillMode;
722   cusparseDiagType_t                diagType;
723 
724   PetscFunctionBegin;
725 
726   /*********************************************/
727   /* Now the Transpose of the Lower Tri Factor */
728   /*********************************************/
729 
730   /* allocate space for the transpose of the lower triangular factor */
731   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
732 
733   /* set the matrix descriptors of the lower triangular factor */
734   matrixType = cusparseGetMatType(loTriFactor->descr);
735   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
736   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
737     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
738   diagType = cusparseGetMatDiagType(loTriFactor->descr);
739 
740   /* Create the matrix description */
741   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
742   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
743   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
744   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
745   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
746 
747   /* Create the solve analysis information */
748   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
749 
750   /* set the operation */
751   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
752 
753   /* allocate GPU space for the CSC of the lower triangular factor*/
754   loTriFactorT->csrMat = new CsrMatrix;
755   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
756   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
757   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
758   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
759   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
760   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
761 
762   /* compute the transpose of the lower triangular factor, i.e. the CSC */
763   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
764                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
765                           loTriFactor->csrMat->values->data().get(),
766                           loTriFactor->csrMat->row_offsets->data().get(),
767                           loTriFactor->csrMat->column_indices->data().get(),
768                           loTriFactorT->csrMat->values->data().get(),
769                           loTriFactorT->csrMat->column_indices->data().get(),
770                           loTriFactorT->csrMat->row_offsets->data().get(),
771                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
772 
773   /* perform the solve analysis on the transposed matrix */
774   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
775                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
776                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
777                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
778                            loTriFactorT->solveInfo);CHKERRCUDA(stat);
779 
780   /* assign the pointer. Is this really necessary? */
781   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
782 
783   /*********************************************/
784   /* Now the Transpose of the Upper Tri Factor */
785   /*********************************************/
786 
787   /* allocate space for the transpose of the upper triangular factor */
788   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
789 
790   /* set the matrix descriptors of the upper triangular factor */
791   matrixType = cusparseGetMatType(upTriFactor->descr);
792   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
793   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
794     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
795   diagType = cusparseGetMatDiagType(upTriFactor->descr);
796 
797   /* Create the matrix description */
798   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
799   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
800   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
801   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
802   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
803 
804   /* Create the solve analysis information */
805   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
806 
807   /* set the operation */
808   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
809 
810   /* allocate GPU space for the CSC of the upper triangular factor*/
811   upTriFactorT->csrMat = new CsrMatrix;
812   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
813   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
814   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
815   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
816   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
817   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
818 
819   /* compute the transpose of the upper triangular factor, i.e. the CSC */
820   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
821                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
822                           upTriFactor->csrMat->values->data().get(),
823                           upTriFactor->csrMat->row_offsets->data().get(),
824                           upTriFactor->csrMat->column_indices->data().get(),
825                           upTriFactorT->csrMat->values->data().get(),
826                           upTriFactorT->csrMat->column_indices->data().get(),
827                           upTriFactorT->csrMat->row_offsets->data().get(),
828                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
829 
830   /* perform the solve analysis on the transposed matrix */
831   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
832                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
833                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
834                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
835                            upTriFactorT->solveInfo);CHKERRCUDA(stat);
836 
837   /* assign the pointer. Is this really necessary? */
838   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
839   PetscFunctionReturn(0);
840 }
841 
842 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
843 {
844   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
845   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
846   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
847   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
848   cusparseStatus_t             stat;
849   cusparseIndexBase_t          indexBase;
850   cudaError_t                  err;
851   PetscErrorCode               ierr;
852 
853   PetscFunctionBegin;
854 
855   /* allocate space for the triangular factor information */
856   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
857   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
858   indexBase = cusparseGetMatIndexBase(matstruct->descr);
859   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
860   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
861 
862   /* set alpha and beta */
863   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
864   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
865   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
866   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
867   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
868   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
869   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
870 
871   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
872     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
873     CsrMatrix *matrixT= new CsrMatrix;
874     matrixT->num_rows = A->cmap->n;
875     matrixT->num_cols = A->rmap->n;
876     matrixT->num_entries = a->nz;
877     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
878     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
879     matrixT->values = new THRUSTARRAY(a->nz);
880 
881     /* compute the transpose of the upper triangular factor, i.e. the CSC */
882     indexBase = cusparseGetMatIndexBase(matstruct->descr);
883     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
884                             matrix->num_cols, matrix->num_entries,
885                             matrix->values->data().get(),
886                             matrix->row_offsets->data().get(),
887                             matrix->column_indices->data().get(),
888                             matrixT->values->data().get(),
889                             matrixT->column_indices->data().get(),
890                             matrixT->row_offsets->data().get(),
891                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
892 
893     /* assign the pointer */
894     matstructT->mat = matrixT;
895     ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
896   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
897 #if CUDA_VERSION>=5000
898     /* First convert HYB to CSR */
899     CsrMatrix *temp= new CsrMatrix;
900     temp->num_rows = A->rmap->n;
901     temp->num_cols = A->cmap->n;
902     temp->num_entries = a->nz;
903     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
904     temp->column_indices = new THRUSTINTARRAY32(a->nz);
905     temp->values = new THRUSTARRAY(a->nz);
906 
907 
908     stat = cusparse_hyb2csr(cusparsestruct->handle,
909                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
910                             temp->values->data().get(),
911                             temp->row_offsets->data().get(),
912                             temp->column_indices->data().get());CHKERRCUDA(stat);
913 
914     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
915     CsrMatrix *tempT= new CsrMatrix;
916     tempT->num_rows = A->rmap->n;
917     tempT->num_cols = A->cmap->n;
918     tempT->num_entries = a->nz;
919     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
920     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
921     tempT->values = new THRUSTARRAY(a->nz);
922 
923     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
924                             temp->num_cols, temp->num_entries,
925                             temp->values->data().get(),
926                             temp->row_offsets->data().get(),
927                             temp->column_indices->data().get(),
928                             tempT->values->data().get(),
929                             tempT->column_indices->data().get(),
930                             tempT->row_offsets->data().get(),
931                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
932 
933     /* Last, convert CSC to HYB */
934     cusparseHybMat_t hybMat;
935     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
936     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
937       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
938     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
939                             matstructT->descr, tempT->values->data().get(),
940                             tempT->row_offsets->data().get(),
941                             tempT->column_indices->data().get(),
942                             hybMat, 0, partition);CHKERRCUDA(stat);
943 
944     /* assign the pointer */
945     matstructT->mat = hybMat;
946     ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);
947 
948     /* delete temporaries */
949     if (tempT) {
950       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
951       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
952       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
953       delete (CsrMatrix*) tempT;
954     }
955     if (temp) {
956       if (temp->values) delete (THRUSTARRAY*) temp->values;
957       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
958       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
959       delete (CsrMatrix*) temp;
960     }
961 #else
962     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
963 #endif
964   }
965   /* assign the compressed row indices */
966   matstructT->cprowIndices = new THRUSTINTARRAY;
967   matstructT->cprowIndices->resize(A->cmap->n);
968   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
969   /* assign the pointer */
970   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
971   PetscFunctionReturn(0);
972 }
973 
974 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
975 {
976   PetscInt                              n = xx->map->n;
977   const PetscScalar                     *barray;
978   PetscScalar                           *xarray;
979   thrust::device_ptr<const PetscScalar> bGPU;
980   thrust::device_ptr<PetscScalar>       xGPU;
981   cusparseStatus_t                      stat;
982   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
983   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
984   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
985   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
986   PetscErrorCode                        ierr;
987 
988   PetscFunctionBegin;
989   /* Analyze the matrix and create the transpose ... on the fly */
990   if (!loTriFactorT && !upTriFactorT) {
991     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
992     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
993     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
994   }
995 
996   /* Get the GPU pointers */
997   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
998   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
999   xGPU = thrust::device_pointer_cast(xarray);
1000   bGPU = thrust::device_pointer_cast(barray);
1001 
1002   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1003   /* First, reorder with the row permutation */
1004   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1005                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1006                xGPU);
1007 
1008   /* First, solve U */
1009   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1010                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1011                         upTriFactorT->csrMat->values->data().get(),
1012                         upTriFactorT->csrMat->row_offsets->data().get(),
1013                         upTriFactorT->csrMat->column_indices->data().get(),
1014                         upTriFactorT->solveInfo,
1015                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1016 
1017   /* Then, solve L */
1018   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1019                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1020                         loTriFactorT->csrMat->values->data().get(),
1021                         loTriFactorT->csrMat->row_offsets->data().get(),
1022                         loTriFactorT->csrMat->column_indices->data().get(),
1023                         loTriFactorT->solveInfo,
1024                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1025 
1026   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1027   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1028                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1029                tempGPU->begin());
1030 
1031   /* Copy the temporary to the full solution. */
1032   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1033   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1034 
1035   /* restore */
1036   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1037   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1038   ierr = WaitForGPU();CHKERRCUDA(ierr);
1039   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1040   PetscFunctionReturn(0);
1041 }
1042 
1043 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1044 {
1045   const PetscScalar                 *barray;
1046   PetscScalar                       *xarray;
1047   cusparseStatus_t                  stat;
1048   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1049   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1051   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1052   PetscErrorCode                    ierr;
1053 
1054   PetscFunctionBegin;
1055   /* Analyze the matrix and create the transpose ... on the fly */
1056   if (!loTriFactorT && !upTriFactorT) {
1057     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1058     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1059     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1060   }
1061 
1062   /* Get the GPU pointers */
1063   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1064   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1065 
1066   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1067   /* First, solve U */
1068   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1069                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1070                         upTriFactorT->csrMat->values->data().get(),
1071                         upTriFactorT->csrMat->row_offsets->data().get(),
1072                         upTriFactorT->csrMat->column_indices->data().get(),
1073                         upTriFactorT->solveInfo,
1074                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1075 
1076   /* Then, solve L */
1077   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1078                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1079                         loTriFactorT->csrMat->values->data().get(),
1080                         loTriFactorT->csrMat->row_offsets->data().get(),
1081                         loTriFactorT->csrMat->column_indices->data().get(),
1082                         loTriFactorT->solveInfo,
1083                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1084   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1085 
1086   /* restore */
1087   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1088   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1089   ierr = WaitForGPU();CHKERRCUDA(ierr);
1090   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1095 {
1096   const PetscScalar                     *barray;
1097   PetscScalar                           *xarray;
1098   thrust::device_ptr<const PetscScalar> bGPU;
1099   thrust::device_ptr<PetscScalar>       xGPU;
1100   cusparseStatus_t                      stat;
1101   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1102   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1103   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1104   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1105   PetscErrorCode                        ierr;
1106 
1107   PetscFunctionBegin;
1108 
1109   /* Get the GPU pointers */
1110   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1111   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1112   xGPU = thrust::device_pointer_cast(xarray);
1113   bGPU = thrust::device_pointer_cast(barray);
1114 
1115   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1116   /* First, reorder with the row permutation */
1117   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1118                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1119                xGPU);
1120 
1121   /* Next, solve L */
1122   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1123                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1124                         loTriFactor->csrMat->values->data().get(),
1125                         loTriFactor->csrMat->row_offsets->data().get(),
1126                         loTriFactor->csrMat->column_indices->data().get(),
1127                         loTriFactor->solveInfo,
1128                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1129 
1130   /* Then, solve U */
1131   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1132                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1133                         upTriFactor->csrMat->values->data().get(),
1134                         upTriFactor->csrMat->row_offsets->data().get(),
1135                         upTriFactor->csrMat->column_indices->data().get(),
1136                         upTriFactor->solveInfo,
1137                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1138   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1139 
1140   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1141   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1142                thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1143                tempGPU->begin());
1144 
1145   /* Copy the temporary to the full solution. */
1146   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1147 
1148   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1149   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1150   ierr = WaitForGPU();CHKERRCUDA(ierr);
1151   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1152   PetscFunctionReturn(0);
1153 }
1154 
1155 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1156 {
1157   const PetscScalar                 *barray;
1158   PetscScalar                       *xarray;
1159   cusparseStatus_t                  stat;
1160   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1161   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1162   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1163   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1164   PetscErrorCode                    ierr;
1165 
1166   PetscFunctionBegin;
1167   /* Get the GPU pointers */
1168   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1169   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1170 
1171   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1172   /* First, solve L */
1173   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1174                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1175                         loTriFactor->csrMat->values->data().get(),
1176                         loTriFactor->csrMat->row_offsets->data().get(),
1177                         loTriFactor->csrMat->column_indices->data().get(),
1178                         loTriFactor->solveInfo,
1179                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1180 
1181   /* Next, solve U */
1182   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1183                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1184                         upTriFactor->csrMat->values->data().get(),
1185                         upTriFactor->csrMat->row_offsets->data().get(),
1186                         upTriFactor->csrMat->column_indices->data().get(),
1187                         upTriFactor->solveInfo,
1188                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1189   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1190 
1191   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1192   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1193   ierr = WaitForGPU();CHKERRCUDA(ierr);
1194   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1195   PetscFunctionReturn(0);
1196 }
1197 
1198 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1199 {
1200   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1201   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1202   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1203   PetscInt                     m = A->rmap->n,*ii,*ridx;
1204   PetscErrorCode               ierr;
1205   cusparseStatus_t             stat;
1206   cudaError_t                  err;
1207 
1208   PetscFunctionBegin;
1209   if (A->pinnedtocpu) PetscFunctionReturn(0);
1210   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1211     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1212     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1213       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1214       /* copy values only */
1215       matrix->values->assign(a->a, a->a+a->nz);
1216       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1217     } else {
1218       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1219       try {
1220         cusparsestruct->nonzerorow=0;
1221         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1222 
1223         if (a->compressedrow.use) {
1224           m    = a->compressedrow.nrows;
1225           ii   = a->compressedrow.i;
1226           ridx = a->compressedrow.rindex;
1227         } else {
1228           /* Forcing compressed row on the GPU */
1229           int k=0;
1230           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1231           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1232           ii[0]=0;
1233           for (int j = 0; j<m; j++) {
1234             if ((a->i[j+1]-a->i[j])>0) {
1235               ii[k]  = a->i[j];
1236               ridx[k]= j;
1237               k++;
1238             }
1239           }
1240           ii[cusparsestruct->nonzerorow] = a->nz;
1241           m = cusparsestruct->nonzerorow;
1242         }
1243 
1244         /* allocate space for the triangular factor information */
1245         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1246         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1247         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1248         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1249 
1250         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1251         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1252         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1253         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1254         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1255         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1256         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1257 
1258         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1259         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1260           /* set the matrix */
1261           CsrMatrix *matrix= new CsrMatrix;
1262           matrix->num_rows = m;
1263           matrix->num_cols = A->cmap->n;
1264           matrix->num_entries = a->nz;
1265           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1266           matrix->row_offsets->assign(ii, ii + m+1);
1267 
1268           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1269           matrix->column_indices->assign(a->j, a->j+a->nz);
1270 
1271           matrix->values = new THRUSTARRAY(a->nz);
1272           matrix->values->assign(a->a, a->a+a->nz);
1273 
1274           /* assign the pointer */
1275           matstruct->mat = matrix;
1276 
1277         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1278 #if CUDA_VERSION>=4020
1279           CsrMatrix *matrix= new CsrMatrix;
1280           matrix->num_rows = m;
1281           matrix->num_cols = A->cmap->n;
1282           matrix->num_entries = a->nz;
1283           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1284           matrix->row_offsets->assign(ii, ii + m+1);
1285 
1286           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1287           matrix->column_indices->assign(a->j, a->j+a->nz);
1288 
1289           matrix->values = new THRUSTARRAY(a->nz);
1290           matrix->values->assign(a->a, a->a+a->nz);
1291 
1292           cusparseHybMat_t hybMat;
1293           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1294           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1295             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1296           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1297               matstruct->descr, matrix->values->data().get(),
1298               matrix->row_offsets->data().get(),
1299               matrix->column_indices->data().get(),
1300               hybMat, 0, partition);CHKERRCUDA(stat);
1301           /* assign the pointer */
1302           matstruct->mat = hybMat;
1303 
1304           if (matrix) {
1305             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1306             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1307             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1308             delete (CsrMatrix*)matrix;
1309           }
1310 #endif
1311         }
1312 
1313         /* assign the compressed row indices */
1314         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1315         matstruct->cprowIndices->assign(ridx,ridx+m);
1316         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1317 
1318         /* assign the pointer */
1319         cusparsestruct->mat = matstruct;
1320 
1321         if (!a->compressedrow.use) {
1322           ierr = PetscFree(ii);CHKERRQ(ierr);
1323           ierr = PetscFree(ridx);CHKERRQ(ierr);
1324         }
1325         cusparsestruct->workVector = new THRUSTARRAY(m);
1326       } catch(char *ex) {
1327         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1328       }
1329       cusparsestruct->nonzerostate = A->nonzerostate;
1330     }
1331     ierr = WaitForGPU();CHKERRCUDA(ierr);
1332     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1333     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1334   }
1335   PetscFunctionReturn(0);
1336 }
1337 
1338 struct VecCUDAPlusEquals
1339 {
1340   template <typename Tuple>
1341   __host__ __device__
1342   void operator()(Tuple t)
1343   {
1344     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1345   }
1346 };
1347 
1348 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1349 {
1350   PetscErrorCode ierr;
1351 
1352   PetscFunctionBegin;
1353   ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1358 {
1359   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1360   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1361   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1362   const PetscScalar            *xarray;
1363   PetscScalar                  *yarray;
1364   PetscErrorCode               ierr;
1365   cusparseStatus_t             stat;
1366 
1367   PetscFunctionBegin;
1368   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1369   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1370   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1371   if (!matstructT) {
1372     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1373     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1374   }
1375   ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1376   ierr = VecSet(yy,0);CHKERRQ(ierr);
1377   ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1378 
1379   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1380   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1381     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1382     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1383                              mat->num_rows, mat->num_cols,
1384                              mat->num_entries, matstructT->alpha, matstructT->descr,
1385                              mat->values->data().get(), mat->row_offsets->data().get(),
1386                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1387                              yarray);CHKERRCUDA(stat);
1388   } else {
1389 #if CUDA_VERSION>=4020
1390     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1391     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1392                              matstructT->alpha, matstructT->descr, hybMat,
1393                              xarray, matstructT->beta_zero,
1394                              yarray);CHKERRCUDA(stat);
1395 #endif
1396   }
1397   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1398   ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1399   ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1400   if (!cusparsestruct->stream) {
1401     ierr = WaitForGPU();CHKERRCUDA(ierr);
1402   }
1403   ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1404   PetscFunctionReturn(0);
1405 }
1406 
1407 
1408 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1409 {
1410   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1411   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1412   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1413   const PetscScalar            *xarray;
1414   PetscScalar                  *zarray,*dptr,*beta;
1415   PetscErrorCode               ierr;
1416   cusparseStatus_t             stat;
1417   PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */
1418 
1419   PetscFunctionBegin;
1420   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1421   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1422   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1423   try {
1424     cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
1425     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1426     if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
1427       ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1428     } else {
1429       ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1430     }
1431     dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
1432     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
1433 
1434     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1435     /* csr_spmv is multiply add */
1436     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1437       /* here we need to be careful to set the number of rows in the multiply to the
1438          number of compressed rows in the matrix ... which is equivalent to the
1439          size of the workVector */
1440       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1441       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1442                                mat->num_rows, mat->num_cols,
1443                                mat->num_entries, matstruct->alpha, matstruct->descr,
1444                                mat->values->data().get(), mat->row_offsets->data().get(),
1445                                mat->column_indices->data().get(), xarray, beta,
1446                                dptr);CHKERRCUDA(stat);
1447     } else {
1448 #if CUDA_VERSION>=4020
1449       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1450       if (cusparsestruct->workVector->size()) {
1451         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1452                                  matstruct->alpha, matstruct->descr, hybMat,
1453                                  xarray, beta,
1454                                  dptr);CHKERRCUDA(stat);
1455       }
1456 #endif
1457     }
1458     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1459 
1460     if (yy) {
1461       if (dptr != zarray) {
1462         ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1463       } else if (zz != yy) {
1464         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
1465       }
1466     } else if (dptr != zarray) {
1467       ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
1468     }
1469     /* scatter the data from the temporary into the full vector with a += operation */
1470     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1471     if (dptr != zarray) {
1472       thrust::device_ptr<PetscScalar> zptr;
1473 
1474       zptr = thrust::device_pointer_cast(zarray);
1475       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1476                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1477                        VecCUDAPlusEquals());
1478     }
1479     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1480     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1481     if (yy && !cmpr) {
1482       ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1483     } else {
1484       ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1485     }
1486   } catch(char *ex) {
1487     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1488   }
1489   if (!yy) { /* MatMult */
1490     if (!cusparsestruct->stream) {
1491       ierr = WaitForGPU();CHKERRCUDA(ierr);
1492     }
1493   }
1494   ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1495   PetscFunctionReturn(0);
1496 }
1497 
1498 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1499 {
1500   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1501   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1502   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1503   thrust::device_ptr<PetscScalar> zptr;
1504   const PetscScalar               *xarray;
1505   PetscScalar                     *zarray;
1506   PetscErrorCode                  ierr;
1507   cusparseStatus_t                stat;
1508 
1509   PetscFunctionBegin;
1510   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1511   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1512   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1513   if (!matstructT) {
1514     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1515     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1516   }
1517 
1518   try {
1519     ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1520     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1521     ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1522     zptr = thrust::device_pointer_cast(zarray);
1523 
1524     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1525     /* multiply add with matrix transpose */
1526     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1527       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1528       /* here we need to be careful to set the number of rows in the multiply to the
1529          number of compressed rows in the matrix ... which is equivalent to the
1530          size of the workVector */
1531       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1532                                mat->num_rows, mat->num_cols,
1533                                mat->num_entries, matstructT->alpha, matstructT->descr,
1534                                mat->values->data().get(), mat->row_offsets->data().get(),
1535                                mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1536                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1537     } else {
1538 #if CUDA_VERSION>=4020
1539       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1540       if (cusparsestruct->workVector->size()) {
1541         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1542             matstructT->alpha, matstructT->descr, hybMat,
1543             xarray, matstructT->beta_zero,
1544             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1545       }
1546 #endif
1547     }
1548 
1549     /* scatter the data from the temporary into the full vector with a += operation */
1550     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1551         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1552         VecCUDAPlusEquals());
1553     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1554     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1555     ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1556 
1557   } catch(char *ex) {
1558     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1559   }
1560   ierr = WaitForGPU();CHKERRCUDA(ierr);
1561   ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1566 {
1567   PetscErrorCode ierr;
1568 
1569   PetscFunctionBegin;
1570   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1571   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1572   if (A->factortype == MAT_FACTOR_NONE) {
1573     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1574   }
1575   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1576   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1577   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1578   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1579   PetscFunctionReturn(0);
1580 }
1581 
1582 /* --------------------------------------------------------------------------------*/
1583 /*@
1584    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1585    (the default parallel PETSc format). This matrix will ultimately pushed down
1586    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1587    assembly performance the user should preallocate the matrix storage by setting
1588    the parameter nz (or the array nnz).  By setting these parameters accurately,
1589    performance during matrix assembly can be increased by more than a factor of 50.
1590 
1591    Collective
1592 
1593    Input Parameters:
1594 +  comm - MPI communicator, set to PETSC_COMM_SELF
1595 .  m - number of rows
1596 .  n - number of columns
1597 .  nz - number of nonzeros per row (same for all rows)
1598 -  nnz - array containing the number of nonzeros in the various rows
1599          (possibly different for each row) or NULL
1600 
1601    Output Parameter:
1602 .  A - the matrix
1603 
1604    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1605    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1606    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1607 
1608    Notes:
1609    If nnz is given then nz is ignored
1610 
1611    The AIJ format (also called the Yale sparse matrix format or
1612    compressed row storage), is fully compatible with standard Fortran 77
1613    storage.  That is, the stored row and column indices can begin at
1614    either one (as in Fortran) or zero.  See the users' manual for details.
1615 
1616    Specify the preallocated storage with either nz or nnz (not both).
1617    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1618    allocation.  For large problems you MUST preallocate memory or you
1619    will get TERRIBLE performance, see the users' manual chapter on matrices.
1620 
1621    By default, this format uses inodes (identical nodes) when possible, to
1622    improve numerical efficiency of matrix-vector products and solves. We
1623    search for consecutive rows with the same nonzero structure, thereby
1624    reusing matrix information to achieve increased efficiency.
1625 
1626    Level: intermediate
1627 
1628 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1629 @*/
1630 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1631 {
1632   PetscErrorCode ierr;
1633 
1634   PetscFunctionBegin;
1635   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1636   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1637   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1638   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1639   PetscFunctionReturn(0);
1640 }
1641 
1642 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1643 {
1644   PetscErrorCode   ierr;
1645 
1646   PetscFunctionBegin;
1647   if (A->factortype==MAT_FACTOR_NONE) {
1648     if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1649       ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
1650     }
1651   } else {
1652     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1653   }
1654   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1655   PetscFunctionReturn(0);
1656 }
1657 
1658 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1659 {
1660   PetscErrorCode ierr;
1661   Mat C;
1662   cusparseStatus_t stat;
1663   cusparseHandle_t handle=0;
1664 
1665   PetscFunctionBegin;
1666   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
1667   C    = *B;
1668   ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
1669   ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);
1670 
1671   /* inject CUSPARSE-specific stuff */
1672   if (C->factortype==MAT_FACTOR_NONE) {
1673     /* you cannot check the inode.use flag here since the matrix was just created.
1674        now build a GPU matrix data structure */
1675     C->spptr = new Mat_SeqAIJCUSPARSE;
1676     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
1677     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1678     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
1679     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
1680     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1681     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1682     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
1683     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1684   } else {
1685     /* NEXT, set the pointers to the triangular factors */
1686     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1687     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1688     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1689     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1690     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1691     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1692     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1693     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1694     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1695     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1696     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1697     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1698   }
1699 
1700   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1701   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1702   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1703   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1704   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1705   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1706   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1707   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1708 
1709   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1710 
1711   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1712 
1713   ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1714   PetscFunctionReturn(0);
1715 }
1716 
1717 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1718 {
1719   PetscErrorCode ierr;
1720   cusparseStatus_t stat;
1721   cusparseHandle_t handle=0;
1722 
1723   PetscFunctionBegin;
1724   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1725   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
1726 
1727   if (B->factortype==MAT_FACTOR_NONE) {
1728     /* you cannot check the inode.use flag here since the matrix was just created.
1729        now build a GPU matrix data structure */
1730     B->spptr = new Mat_SeqAIJCUSPARSE;
1731     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1732     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1733     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1734     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1735     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1736     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1737     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1738     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1739   } else {
1740     /* NEXT, set the pointers to the triangular factors */
1741     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1742     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1743     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1744     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1745     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1746     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1747     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1748     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1749     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1750     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1751     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1752   }
1753 
1754   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1755   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1756   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1757   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1758   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1759   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1760   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1761   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1762 
1763   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1764 
1765   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1766 
1767   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1768   PetscFunctionReturn(0);
1769 }
1770 
1771 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1772 {
1773   PetscErrorCode ierr;
1774 
1775   PetscFunctionBegin;
1776   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
1777   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
1778   PetscFunctionReturn(0);
1779 }
1780 
1781 /*MC
1782    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1783 
1784    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1785    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1786    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1787 
1788    Options Database Keys:
1789 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1790 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1791 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1792 
1793   Level: beginner
1794 
1795 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1796 M*/
1797 
1798 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1799 
1800 
1801 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1802 {
1803   PetscErrorCode ierr;
1804 
1805   PetscFunctionBegin;
1806   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1807   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1808   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1809   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1810   PetscFunctionReturn(0);
1811 }
1812 
1813 
1814 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1815 {
1816   cusparseStatus_t stat;
1817   cusparseHandle_t handle;
1818 
1819   PetscFunctionBegin;
1820   if (*cusparsestruct) {
1821     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1822     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1823     delete (*cusparsestruct)->workVector;
1824     if (handle = (*cusparsestruct)->handle) {
1825       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1826     }
1827     delete *cusparsestruct;
1828     *cusparsestruct = 0;
1829   }
1830   PetscFunctionReturn(0);
1831 }
1832 
1833 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1834 {
1835   PetscFunctionBegin;
1836   if (*mat) {
1837     delete (*mat)->values;
1838     delete (*mat)->column_indices;
1839     delete (*mat)->row_offsets;
1840     delete *mat;
1841     *mat = 0;
1842   }
1843   PetscFunctionReturn(0);
1844 }
1845 
1846 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1847 {
1848   cusparseStatus_t stat;
1849   PetscErrorCode   ierr;
1850 
1851   PetscFunctionBegin;
1852   if (*trifactor) {
1853     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1854     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1855     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
1856     delete *trifactor;
1857     *trifactor = 0;
1858   }
1859   PetscFunctionReturn(0);
1860 }
1861 
1862 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1863 {
1864   CsrMatrix        *mat;
1865   cusparseStatus_t stat;
1866   cudaError_t      err;
1867 
1868   PetscFunctionBegin;
1869   if (*matstruct) {
1870     if ((*matstruct)->mat) {
1871       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1872         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1873         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1874       } else {
1875         mat = (CsrMatrix*)(*matstruct)->mat;
1876         CsrMatrix_Destroy(&mat);
1877       }
1878     }
1879     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1880     delete (*matstruct)->cprowIndices;
1881     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1882     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1883     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1884     delete *matstruct;
1885     *matstruct = 0;
1886   }
1887   PetscFunctionReturn(0);
1888 }
1889 
1890 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1891 {
1892   cusparseHandle_t handle;
1893   cusparseStatus_t stat;
1894 
1895   PetscFunctionBegin;
1896   if (*trifactors) {
1897     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1898     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1899     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1900     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1901     delete (*trifactors)->rpermIndices;
1902     delete (*trifactors)->cpermIndices;
1903     delete (*trifactors)->workVector;
1904     if (handle = (*trifactors)->handle) {
1905       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1906     }
1907     delete *trifactors;
1908     *trifactors = 0;
1909   }
1910   PetscFunctionReturn(0);
1911 }
1912 
1913