xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision e2cf4d64f85d15bf608e4b3f49503185edb52e15)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* Human-readable names for MatCUSPARSEStorageFormat; the trailing entries are the
   enum type name and option prefix expected by PetscOptionsEnum(), 0-terminated. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic factorization hooks installed on the factor matrix
   by MatGetFactor_seqaijcusparse_cusparse() below. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solve and matrix-vector product implementations (bodies appear later in this file). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for the GPU-side data structures.  The two declarations named
   MatSeqAIJCUSPARSEMultStruct_Destroy are C++ overloads (this is a .cu file,
   compiled as C++), distinguished by their parameter types. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/* MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU data
   structure and makes the cuSPARSE handle issue its work on that stream.
   Assumes A->spptr has already been set up (i.e. A is a seqaijcusparse matrix). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cerr;

  PetscFunctionBegin;
  cusp->stream = stream;
  cerr = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
51 
/* MatCUSPARSESetHandle - installs an externally created cuSPARSE handle on the
   matrix, destroying any handle the matrix previously owned, and switches the
   handle to device pointer mode (scalars passed to cuSPARSE live on the GPU). */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cerr;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the old handle before adopting the new one */
    if (cusp->handle) {cerr = cusparseDestroy(cusp->handle);CHKERRCUDA(cerr);}
    cusp->handle = handle;
  }
  cerr = cusparseSetPointerMode(cusp->handle,CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
67 
/* MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle stored
   on the matrix; used when the handle is owned elsewhere and must not be freed by
   this matrix's destructor.

   Fix: guard against A->spptr being NULL — unlike the Set functions above, this
   clear routine can reasonably be invoked on a matrix whose GPU structure was
   never allocated (or was already torn down), and dereferencing NULL here would
   crash instead of being a harmless no-op. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
76 
/* MatFactorGetSolverType_seqaij_cusparse - query callback composed on factor
   matrices created by this package; always reports MATSOLVERCUSPARSE.
   The matrix argument is unused. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/* MatGetFactor_seqaijcusparse_cusparse - creates an empty factor matrix of type
   MATSEQAIJCUSPARSE and installs the symbolic-factorization hooks appropriate to
   the requested factor type (LU/ILU/ILUDT or Cholesky/ICC); any other factor type
   is rejected.  Also composes the solver-type query callback on the new matrix. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* MAT_SKIP_ALLOCATION: the symbolic factorization allocates the actual storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/* MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat().  For sequential matrices MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both set the single storage-format field; other operations
   are errors.  On CUDA < 4.2 only CSR is available, so ELL/HYB are rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
143 
144 /*@
145    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146    operation. Only the MatMult operation can use different GPU storage formats
147    for MPIAIJCUSPARSE matrices.
148    Not Collective
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
154 
155    Output Parameter:
156 
157    Level: intermediate
158 
159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160 @*/
/* Public entry point: validates the matrix header and dispatches to the
   type-specific MatCUSPARSESetFormat_C method if the matrix provides one
   (PetscTryMethod is a no-op for types that do not compose the method). */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
170 
/* Options processing for seqaijcusparse matrices.
   -mat_cusparse_mult_storage_format : SpMV-only storage format (unfactored matrices only)
   -mat_cusparse_storage_format      : storage format for both SpMV and triangular solves */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only knob makes no sense on a factor matrix */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
196 
/* Symbolic ILU: delegate entirely to the CPU AIJ implementation, then redirect
   the numeric phase to the CUSPARSE variant so the factors are copied to the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
206 
/* Symbolic LU: delegate entirely to the CPU AIJ implementation, then redirect
   the numeric phase to the CUSPARSE variant so the factors are copied to the GPU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
216 
/* Symbolic ICC: delegate entirely to the CPU AIJ implementation, then redirect
   the numeric phase to the CUSPARSE variant so the factors are copied to the GPU. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
226 
/* Symbolic Cholesky: delegate entirely to the CPU AIJ implementation, then redirect
   the numeric phase to the CUSPARSE variant so the factors are copied to the GPU. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
236 
/* Builds the GPU copy of the unit lower-triangular factor L of an (I)LU factorization.
   The host factor in A->data stores the strictly-lower part of each row with an
   implicit unit diagonal; this routine expands it into an explicit CSR matrix
   (1's on the diagonal included), uploads it to the device via thrust, and runs
   the cusparse triangular-solve analysis used by the later MatSolve calls.
   Only runs when the GPU copy is missing or stale (offload mask UNALLOCATED/CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;    /* host CSR row offsets / column indices of L */
  PetscScalar                       *AALo;           /* host CSR values of L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* = n unit-diagonal entries + strictly-lower entries of rows 1..n-1 (row 0 has none,
         hence ai[n]-ai[1] rather than ai[n]-ai[0]) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory (cudaMallocHost) speeds up the later host->device copies */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 consists of the unit diagonal entry only */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;         /* next free slot in AjLo/AALo */
      rowOffset= 1;         /* running CSR row offset */
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the explicit unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust assign() performs the host->device upload */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the device copies own the data now; release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/* Builds the GPU copy of the upper-triangular factor U of an (I)LU factorization.
   The host factor stores U's off-diagonal entries via the a->diag index array
   (adiag), with the stored diagonal entry being the reciprocal pivot; here the
   diagonal value written is 1/v[nz] so that U carries the actual pivot on its
   diagonal (NON_UNIT diag type below).  Rows are walked backwards (i = n-1..0),
   which matches how adiag indexes the factored storage — NOTE(review): adiag
   layout is established by the CPU factorization, not visible in this file.
   Only runs when the GPU copy is missing or stale (offload mask UNALLOCATED/CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;    /* host CSR row offsets / column indices of U */
  PetscScalar                       *AAUp;           /* host CSR values of U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs downward through the factored storage, so this difference is the U entry count */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, building rows from the last to the first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* stored pivot is reciprocal, so 1./v[nz] recovers the actual diagonal value */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the off-diagonal entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      /* unlike L, U carries its real diagonal, hence NON_UNIT */
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust assign() performs the host->device upload */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the device copies own the data now; release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
438 
/* Uploads both (I)LU triangular factors to the GPU, allocates the device work
   vector used by the triangular solves, and, when the row/column orderings are
   not the identity, uploads the corresponding permutation index arrays so the
   permuted MatSolve variants can be used. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis = aij->row,colis = aij->icol;
  PetscBool                    rid,cid;
  const PetscInt               *ridx,*cidx;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  factors->workVector = new THRUSTARRAY(n);   /* device scratch for the two-stage solve */
  factors->nnz        = aij->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* row permutation (applied on the lower-triangular side) */
  ierr = ISGetIndices(rowis,&ridx);CHKERRQ(ierr);
  ierr = ISIdentity(rowis,&rid);CHKERRQ(ierr);
  if (!rid) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(ridx,ridx+n);
    nperm++;
  }
  ierr = ISRestoreIndices(rowis,&ridx);CHKERRQ(ierr);

  /* column permutation (applied on the upper-triangular side) */
  ierr = ISGetIndices(colis,&cidx);CHKERRQ(ierr);
  ierr = ISIdentity(colis,&cid);CHKERRQ(ierr);
  if (!cid) {
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(cidx,cidx+n);
    nperm++;
  }

  /* log n ints per permutation actually uploaded */
  if (nperm) {ierr = PetscLogCpuToGpu(nperm*n*sizeof(PetscInt));CHKERRQ(ierr);}

  ierr = ISRestoreIndices(colis,&cidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
485 
/* Builds both GPU triangular factors for an ICC/Cholesky factorization from the
   single stored upper-triangular host factor.  Two value arrays are assembled in
   one pass over the rows: AAUp for the (unit-diagonal) upper solve, and AALo,
   which rescales each row by the pivot, for the "lower" solve — the lower factor
   is never formed explicitly; instead the upper-factor pattern is reused and
   solved in TRANSPOSE mode (see loTriFactor below).
   NOTE(review): A->data is cast to both Mat_SeqAIJ and Mat_SeqSBAIJ; this relies
   on the factored storage exposing the same leading i/j/a layout under both
   views — confirm against the CPU ICC factorization.
   Only runs when the GPU copy is missing or stale (offload mask UNALLOCATED/CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;   /* host CSR row offsets / column indices (shared by both factors) */
  PetscScalar                       *AAUp;          /* host values for the upper solve */
  PetscScalar                       *AALo;          /* host values for the transpose ("lower") solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored pivot of row i; the reciprocal is used on both sides */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries: negated for the upper solve, additionally
             scaled by the reciprocal pivot for the transpose solve */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: thrust assign() performs the host->device upload */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      /* deliberately UPPER: the stored data is the upper factor, and the lower
         solve is realized by solving it in TRANSPOSE mode (solveOp below) */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* the device copies own the data now; release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
631 
/* Uploads the ICC/Cholesky triangular factors to the GPU, allocates the device
   work vector, and (when the ordering is not the identity) uploads the single
   symmetric permutation as both the row and column permutation arrays. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm = aij->row;
  const PetscInt               *pidx;
  PetscBool                    identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(n);   /* device scratch for the two-stage solve */
  /* off-diagonals are stored once but represent both triangles, hence the doubling */
  factors->nnz = (aij->nz-n)*2 + n;

  /* Cholesky/ICC uses the same permutation on both sides */
  ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(pidx,pidx+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(pidx,pidx+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
660 
/* Numeric LU: the factorization itself runs on the CPU via the AIJ kernel; this
   wrapper then selects the natural-ordering or permuted GPU solve variants and
   uploads the triangular factors to the device. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(b->row,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(b->col,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
685 
/*
   Performs the numeric Cholesky factorization on the CPU
   (MatCholeskyFactorNumeric_SeqAIJ), installs the appropriate GPU solve
   kernels, and uploads the triangular factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm  = fact->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* The natural-ordering kernels skip the permutation steps, so they may only
     be used when the factorization ordering is the identity. */
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* copy the triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
710 
/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit transposes
   (CSC, stored as the CSR of the transpose) of the lower and upper
   triangular factors and runs the CUSPARSE solve analysis on them, so that
   MatSolveTranspose can be performed entirely on the GPU with
   CUSPARSE_OPERATION_NON_TRANSPOSE solves.  The results are cached in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose of the factor container.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* allocated below; initializing from A->spptr here was a dead store */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;  /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     transposing swaps the fill mode (lower <-> upper) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* non-transpose solve, since the transpose is stored explicitly */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transpose in the factor container so later solves find it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     transposing swaps the fill mode (lower <-> upper) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* non-transpose solve, since the transpose is stored explicitly */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transpose in the factor container so later solves find it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
841 
/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds an explicit transpose
   of A on the GPU so MatMultTranspose can run as a regular (non-transpose)
   CUSPARSE multiply.  The result is cached in cusparsestruct->matTranspose.

   Note that A may be rectangular: the transpose has A->cmap->n rows and
   A->rmap->n columns, so all row-dimension-dependent allocations below must
   use the transposed dimensions.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;   /* allocated below; initializing from A->spptr here was a dead store */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the matrix-multiply information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars, matching the
     CUSPARSE_POINTER_MODE_DEVICE setting below */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* bug fix: row_offsets must have num_rows(T)+1 = A->cmap->n+1 entries;
       the previous A->rmap->n+1 undersized the buffer for rectangular A
       with more columns than rows */
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((matrixT->num_rows+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR: temp has the dimensions of A itself */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose).
       bug fix: tempT holds the TRANSPOSE, so its dimensions are swapped
       relative to A and its row_offsets buffer is sized by A->cmap->n+1;
       the previous code used A's own dimensions, which undersized the
       buffer for rectangular A with more columns than rows */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB.  bug fix: pass the transposed dimensions
       (rows of tempT = A->cmap->n, cols of tempT = A->rmap->n) */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices: the transpose is not row-compressed,
     so this is simply the identity map over the A->cmap->n rows of A^T */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* cache in the Mat so MatMultTranspose finds it */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
973 
/*
   MatSolveTranspose_SeqAIJCUSPARSE - GPU solve of A^T x = b with permuted
   triangular factors.  Since A = L*U, A^T = U^T*L^T, so the U factor is
   applied before L.  The transposed (CSC) factors are generated lazily on
   first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().

   This variant is installed only when the factorization orderings are not
   the identity (see MatLUFactorNumeric_SeqAIJCUSPARSE and
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE), so rpermIndices/cpermIndices
   are assumed to be allocated.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;  /* scratch shared with the other solves */
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* refresh the locals from the fields the analysis just populated */
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: tempGPU = U^T \ x (solveOp is NON_TRANSPOSE because the
     factor is stored already transposed) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: x = L^T \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves, diagonal counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1042 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - GPU solve of A^T x = b
   when the factorization orderings are the identity, so no permutation of
   b or x is needed.  Since A = L*U, A^T = U^T*L^T: U is applied before L,
   using the transposed (CSC) factors generated lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;  /* scratch shared with the other solves */
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* refresh the locals from the fields the analysis just populated */
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: tempGPU = U^T \ b (solveOp is NON_TRANSPOSE because the
     factor is stored already transposed) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: x = L^T \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves, diagonal counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1093 
/*
   MatSolve_SeqAIJCUSPARSE - GPU solve of A x = b (A = L*U) with permuted
   triangular factors: b is first gathered with the row permutation, then
   L and U solves are applied, and finally the result is scattered with the
   column permutation.

   This variant is installed only when the factorization orderings are not
   the identity (see MatLUFactorNumeric_SeqAIJCUSPARSE and
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE), so rpermIndices/cpermIndices
   are assumed to be allocated.

   NOTE(review): unlike MatSolveTranspose_SeqAIJCUSPARSE, the final two
   thrust::copy calls here fall outside the PetscLogGpuTimeBegin/End region,
   and the end-of-range permutation iterators are built on bGPU/xGPU rather
   than bGPU+n/xGPU+n — presumably equivalent since the iteration extent is
   governed by the rpermIndices/cpermIndices ranges; confirm against thrust
   permutation_iterator semantics.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;  /* scratch shared with the other solves */
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: tempGPU = L \ x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: x = U \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves, diagonal counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1154 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - GPU solve of A x = b (A = L*U)
   when the factorization orderings are the identity, so no permutation of
   b or x is needed: solve L then U directly on the vector arrays.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;  /* scratch shared with the other solves */
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: tempGPU = L \ b */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U: x = U \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves, diagonal counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1197 
/*
   MatSeqAIJCUSPARSECopyToGPU - copies the host (Mat_SeqAIJ) matrix data to the GPU,
   building the cuSPARSE multiply structure in the storage format selected in
   cusparsestruct->format (CSR, ELL, or HYB; ELL/HYB require CUDA >= 4.2).

   Fast path: if the matrix is assembled, the nonzero pattern is unchanged
   (nonzerostate matches) and the format is CSR, only the numerical values are
   re-uploaded. Otherwise the entire GPU structure is destroyed and rebuilt.
   Rows with no nonzeros are always compressed away on the GPU side.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0); /* matrix is pinned to the CPU; never copy */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* nonzero pattern changed (or first copy): discard old GPU structure and rebuild */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count rows that contain at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* the CPU matrix already stores a compressed-row view; reuse it */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];  /* row offset of the k-th nonempty row */
              ridx[k]= j;        /* original row index of the k-th nonempty row */
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate the new multiply structure and its cuSPARSE matrix descriptor */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalar constants 1, 0, 1; cuSPARSE is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* build a temporary device CSR matrix, convert it to HYB/ELL, then free the CSR copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL uses a fixed (max-width) partition; HYB lets cuSPARSE choose */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed after conversion */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
#endif
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          /* ii/ridx were allocated above in this function; compressedrow arrays are owned by Mat_SeqAIJ */
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector sized to the number of compressed (nonempty) rows */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1337 
/* Binary functor for thrust::for_each over a zip iterator: accumulates the
   first tuple element into the second, i.e. get<1>(t) += get<0>(t).
   Usable from both host and device code. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1347 
/* Computes yy = A*xx on the GPU, implemented as MatMultAdd with a NULL "add" vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1356 
/*
   Computes yy = A^T * xx on the GPU.

   The explicit transpose structure (cusparsestruct->matTranspose) is built lazily
   on first use; the SpMV is then a NON_TRANSPOSE operation on that structure.
   yy is zeroed first and the product written with beta = 0 (device-resident scalar).
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the transpose structure */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* 2 flops per stored nonzero, minus one add per nonempty row */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1406 
1407 
/*
   Computes zz = A*xx + yy on the GPU (yy may be NULL, giving zz = A*xx; see
   MatMult_SeqAIJCUSPARSE).

   The GPU matrix stores only the nonempty rows (compressed-row form). When the
   matrix has empty rows, the SpMV result goes into the temporary workVector and
   is then scattered into zz through cprowIndices with a += operation; otherwise
   the SpMV writes directly into zz.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* if workVector covers all rows (no empty rows), write the SpMV result straight into zz */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* beta = 1 only when we can fold yy into zz in the SpMV itself (yy aliases zz and we write directly) */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
#endif
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result is in workVector; seed zz with yy before the scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* SpMV wrote A*xx into zz with beta = 0; add yy explicitly */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* pure MatMult via workVector: rows not touched by the scatter must be zero */
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1487 
/*
   Computes zz = A^T * xx + yy on the GPU.

   yy is first copied into zz; the transpose SpMV result is computed into the
   temporary workVector and then scattered into zz with a += through the
   transpose structure's cprowIndices.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the transpose structure */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* seed zz = yy, then scatter-add the product below */
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta_zero,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* NOTE(review): the iteration count here is A->cmap->n (rows of A^T), unlike
       MatMultAdd which uses workVector->size(); this assumes workVector and the
       transpose cprowIndices both cover A->cmap->n entries -- confirm */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1554 
/*
   Finishes assembly: runs the CPU-side SeqAIJ assembly, and on a final assembly
   of an unfactored matrix pushes the data to the GPU and (re)installs the
   CUSPARSE multiply operations.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    if (A->factortype == MAT_FACTOR_NONE) {
      /* factored matrices keep their data in the triangular-factor structures */
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1571 
1572 /* --------------------------------------------------------------------------------*/
1573 /*@
1574    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1576    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1577    assembly performance the user should preallocate the matrix storage by setting
1578    the parameter nz (or the array nnz).  By setting these parameters accurately,
1579    performance during matrix assembly can be increased by more than a factor of 50.
1580 
1581    Collective
1582 
1583    Input Parameters:
1584 +  comm - MPI communicator, set to PETSC_COMM_SELF
1585 .  m - number of rows
1586 .  n - number of columns
1587 .  nz - number of nonzeros per row (same for all rows)
1588 -  nnz - array containing the number of nonzeros in the various rows
1589          (possibly different for each row) or NULL
1590 
1591    Output Parameter:
1592 .  A - the matrix
1593 
1594    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1596    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1597 
1598    Notes:
1599    If nnz is given then nz is ignored
1600 
1601    The AIJ format (also called the Yale sparse matrix format or
1602    compressed row storage), is fully compatible with standard Fortran 77
1603    storage.  That is, the stored row and column indices can begin at
1604    either one (as in Fortran) or zero.  See the users' manual for details.
1605 
1606    Specify the preallocated storage with either nz or nnz (not both).
1607    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1608    allocation.  For large problems you MUST preallocate memory or you
1609    will get TERRIBLE performance, see the users' manual chapter on matrices.
1610 
1611    By default, this format uses inodes (identical nodes) when possible, to
1612    improve numerical efficiency of matrix-vector products and solves. We
1613    search for consecutive rows with the same nonzero structure, thereby
1614    reusing matrix information to achieve increased efficiency.
1615 
1616    Level: intermediate
1617 
1618 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1619 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast drops const to match the internal preallocation routine's signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1631 
/*
   Destroys the GPU-side data (mult structures for unfactored matrices, triangular
   factors otherwise), then delegates to the SeqAIJ destructor for the host data.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* nothing to free if the GPU copy was never allocated */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1647 
/*
   Duplicates A into *B: performs the CPU-side SeqAIJ duplication, then attaches
   fresh (empty) GPU data structures and installs the CUSPARSE operations. The
   GPU copy itself is rebuilt lazily (valid_GPU_matrix starts UNALLOCATED).
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    C->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1706 
/*
   Converts a SeqAIJ matrix B in place to SEQAIJCUSPARSE: attaches fresh (empty)
   GPU data structures (or triangular-factor containers for factored matrices),
   creates a cuSPARSE handle, and installs the CUSPARSE matrix operations.
*/
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    B->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1760 
/* Type constructor for MATSEQAIJCUSPARSE: builds a plain SeqAIJ matrix, then converts it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1770 
1771 /*MC
1772    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1773 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
1775    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1776    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1777 
1778    Options Database Keys:
1779 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1780 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1781 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1782 
1783   Level: beginner
1784 
1785 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1786 M*/
1787 
1788 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1789 
1790 
/*
   Registers the CUSPARSE solver package for all factorization types
   (LU, Cholesky, ILU, ICC) of MATSEQAIJCUSPARSE matrices.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(factors)/sizeof(factors[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1802 
1803 
/*
   Frees all GPU data held by a Mat_SeqAIJCUSPARSE struct (mult structures for A
   and A^T, work vector, cuSPARSE handle) and deletes the struct itself,
   zeroing the caller's pointer. Safe to call with *cusparsestruct == NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    Mat_SeqAIJCUSPARSE *s = *cusparsestruct;
    MatSeqAIJCUSPARSEMultStruct_Destroy(&s->mat,s->format);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&s->matTranspose,s->format);
    delete s->workVector;
    if (s->handle) {
      stat = cusparseDestroy(s->handle);CHKERRCUDA(stat);
    }
    delete s;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1822 
/*
   Frees the three device arrays of a CsrMatrix and the struct itself, zeroing
   the caller's pointer. Safe to call with *mat == NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1835 
/*
   Frees a triangular-factor structure: cuSPARSE matrix descriptor, solve
   analysis info, and the factor's CSR matrix; zeroes the caller's pointer.
   Safe to call with *trifactor == NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *t = *trifactor;
    if (t->descr)     { stat = cusparseDestroyMatDescr(t->descr);CHKERRCUDA(stat); }
    if (t->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(t->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&t->csrMat);CHKERRQ(ierr);
    delete t;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1851 
/*
   Frees a multiply structure: the stored matrix (HYB/ELL handle or CSR arrays,
   depending on format), the cuSPARSE descriptor, the compressed-row index array,
   and the three device-resident scalar constants; zeroes the caller's pointer.
   Safe to call with *matstruct == NULL.

   Fix: the return code of CsrMatrix_Destroy() was previously discarded; it is
   now checked with CHKERRQ, consistent with the trifactor overload above.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* the alpha/beta constants were allocated with cudaMalloc in MatSeqAIJCUSPARSECopyToGPU */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1879 
/*
   Frees all GPU data held by the triangular-factor container: the four factor
   structures (lower/upper, plain and transposed), permutation index arrays, the
   work vector, and the cuSPARSE handle; zeroes the caller's pointer.
   Safe to call with *trifactors == NULL.

   Fix: the return codes of the four MatSeqAIJCUSPARSEMultStruct_Destroy() calls
   were previously discarded; they are now checked with CHKERRQ. The assignment
   inside the if-condition is also split out to avoid the classic `=` vs `==`
   compiler warning.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1902 
1903