xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 95639643fa8b593562bb075529e8706aa433bb85)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* String names for the MatCUSPARSEStorageFormat enum, in the layout PetscOptionsEnum()
   expects: the value names, then the enum type name, the option prefix, and a
   terminating null entry. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic factorization entry points installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: solve and multiply kernels plus options handling */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations: destructors for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
42 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU
   data and attaches it to the matrix's cuSPARSE handle so subsequent cuSPARSE
   calls for this matrix are issued on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
53 
/*
   MatCUSPARSESetHandle - adopts the caller-supplied cuSPARSE handle for this
   matrix, destroying any handle the matrix previously owned, and switches the
   handle to device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the old handle before adopting the new one */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
69 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle
   stored in the matrix's GPU data; used when the handle is owned elsewhere.

   Fix: guard against a NULL A->spptr. The original dereferenced
   cusparsestruct unconditionally and would segfault if called on a matrix
   whose GPU data structure has not been allocated (or was already destroyed).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* no GPU data: nothing to clear */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
78 
/*
   MatFactorGetSolverType_seqaij_cusparse - query callback composed on factor
   matrices; reports MATSOLVERCUSPARSE as the solver package that produced
   the factor. The input matrix A is not inspected.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
85 
86 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
93 
94   Level: beginner
95 
96 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
97 M*/
98 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an empty MATSEQAIJCUSPARSE
   factor matrix B of the same (square) size as A and installs the symbolic
   factorization routines appropriate to the requested factor type.
   LU/ILU/ILUDT and Cholesky/ICC are supported; anything else errors.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* MAT_SKIP_ALLOCATION: the symbolic factorization will allocate the structure */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
123 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(). For sequential matrices MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both set the single stored format; other operations
   (the MPI diag/offdiag ones) are rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: a seq matrix has only one format slot */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
141 
142 /*@
143    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144    operation. Only the MatMult operation can use different GPU storage formats
145    for MPIAIJCUSPARSE matrices.
146    Not Collective
147 
148    Input Parameters:
149 +  A - Matrix of type SEQAIJCUSPARSE
150 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
152 
153    Output Parameter:
154 
155    Level: intermediate
156 
157 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed; a no-op otherwise */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the runtime options that pick
   the GPU storage format: -mat_cusparse_mult_storage_format (SpMV only, offered
   only for non-factored matrices) and -mat_cusparse_storage_format (SpMV and
   triangular solve).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
194 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - delegates the symbolic ILU to the CPU
   AIJ implementation, then swaps in the CUSPARSE numeric routine so the
   numeric phase ends with the factors pushed to the GPU.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - delegates the symbolic LU to the CPU
   AIJ implementation, then installs the CUSPARSE numeric routine.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - delegates the symbolic ICC to the CPU
   AIJ implementation, then installs the CUSPARSE Cholesky numeric routine.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - delegates the symbolic Cholesky
   to the CPU AIJ implementation, then installs the CUSPARSE numeric routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/*
   Builds the unit-diagonal lower-triangular CSR factor L on the GPU from the
   host-side factor data in A and runs the cuSPARSE solve analysis on it; the
   result is stored in ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr.
   Only rebuilds when the device copy is absent or stale (offloadmask is
   UNALLOCATED or CPU). NOTE(review): assumes a->i/a->j/a->a hold the strictly
   lower part of row i at positions ai[i]..ai[i+1)-1 (ILU factor storage) —
   confirm against MatILUFactorSymbolic_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;    /* host staging: CSR row offsets / column indices */
  PetscScalar                       *AALo;           /* host staging: CSR values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal terms + ai[n]-ai[1] stored entries of rows 1..n-1 (row 0 has only its implicit unit diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory (cudaMallocHost) so the subsequent host-to-device copies are fast */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;      /* row 0 is just the unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): the value loTriFactor was initialized with above is never read; it is simply replaced here */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below perform the host-to-device uploads */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging buffers; the data now lives on the device */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/*
   Builds the upper-triangular CSR factor U on the GPU from the host-side
   factor data in A and runs the cuSPARSE solve analysis on it; the result is
   stored in ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr.
   Rows are assembled in reverse (i = n-1..0) because a->diag points at the
   diagonal entries and the upper part of each row is addressed relative to
   the next row's diagonal. The stored diagonal is inverted here (1/v[nz]) so
   the device factor carries reciprocal diagonal values with a NON_UNIT diag type.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;    /* host staging: CSR row offsets / column indices */
  PetscScalar                       *AAUp;           /* host staging: CSR values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases as the row index grows here, so this span covers all upper entries incl. diagonals */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory for fast upload) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        /* v/vi point just past the diagonal entry of the (i+1)-th row's block, i.e. row i's upper part */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];   /* v[nz] is the stored diagonal; keep its reciprocal */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): the value upTriFactor was initialized with above is never read; it is simply replaced here */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below perform the host-to-device uploads */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers; the data now lives on the device */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
440 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - after the host-side LU/ILU
   numeric factorization, pushes both triangular factors to the GPU (building
   them and running the cuSPARSE solve analysis), allocates the intermediate
   work vector, and uploads the row/column permutations when nontrivial.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* row permutation (lower solve): uploaded only when not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    nperm++;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation (upper solve): uploaded only when not the identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    nperm++;
  }

  /* log one host-to-device transfer of n indices per uploaded permutation */
  if (nperm) {ierr = PetscLogCpuToGpu(nperm*n*sizeof(PetscInt));CHKERRQ(ierr);}

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
487 
/*
   Builds BOTH triangular factors for an ICC/Cholesky factor on the GPU from a
   single upper-triangular CSR image assembled on the host:
     - upTriFactor: the upper factor, solved NON_TRANSPOSE with a unit diagonal
       (the reciprocal diagonals are stored explicitly in the value array);
     - loTriFactor: the lower solve, realized as a TRANSPOSE solve of the same
       upper-triangular sparsity (hence FILL_MODE_UPPER on the "lo" descriptor)
       with column-scaled values (AALo) and a NON_UNIT diagonal.
   Results are stored in the up/loTriFactorPtr slots of A->spptr and the
   offload mask is set to BOTH. Only runs when the device copy is absent/stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* host staging: shared CSR pattern for both factors */
  PetscScalar                       *AAUp;          /* host staging: values of the upper factor */
  PetscScalar                       *AALo;          /* host staging: values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): b aliases the same A->data as a, reinterpreted as Mat_SeqSBAIJ;
     presumably the ICC factor stores SBAIJ-compatible i/j/a arrays — confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host memory for fast upload) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored diagonal of row i; both factors keep its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: negate for the upper factor; additionally scale by 1/diag for the lower */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, UNIT diagonal
         (the true reciprocal diagonal is carried in the value array instead) */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below perform the host-to-device uploads */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: UPPER fill because the lower solve is
         performed as a TRANSPOSE solve of this upper-triangular pattern */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as the upper factor, different values (AALo) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging buffers; the data now lives on the device */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
635 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - after the host-side ICC/Cholesky
   numeric factorization, pushes the triangular factors to the GPU, allocates
   the intermediate work vector, and, when the ordering is not the identity,
   uploads the permutation and its inverse (used as the row and column
   permutations respectively).
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *perm;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* count each off-diagonal of the symmetric factor twice, the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  ierr = ISGetIndices(ip,&perm);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             invip;
    const PetscInt *iperm;

    /* both the permutation and its inverse are needed on the device */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&invip);CHKERRQ(ierr);
    ierr = ISGetIndices(invip,&iperm);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(perm, perm+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(iperm, iperm+n);
    ierr = ISRestoreIndices(invip,&iperm);CHKERRQ(ierr);
    ierr = ISDestroy(&invip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&perm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
671 
/*
   Numeric LU factorization for SeqAIJCUSPARSE: the factorization itself runs
   on the CPU; afterwards the triangular factors are staged on the GPU and
   the MatSolve implementations are chosen based on the orderings used.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             rowis = fact->row,colis = fact->col;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* identity row and column permutations admit the faster natural-ordering solves */
  ierr = ISIdentity(rowis,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(colis,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL; /* no GPU MatMatSolve available */
  B->ops->matsolvetranspose = NULL;

  /* stage the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
700 
/*
   Numeric Cholesky (ICC) factorization for SeqAIJCUSPARSE: the factorization
   runs on the CPU; afterwards the triangular factors are staged on the GPU
   and the MatSolve implementations are selected based on the ordering.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             ip    = fact->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* an identity permutation admits the faster natural-ordering solves */
  ierr = ISIdentity(ip,&identity);CHKERRQ(ierr);
  B->ops->solve          = identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                    : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                    : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL; /* no GPU MatMatSolve available */
  B->ops->matsolvetranspose = NULL;

  /* stage the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
729 
/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - build CSC (transposed CSR)
   copies of the cached lower and upper triangular factors and run the
   cuSPARSE solve analysis on them.

   The transposed factors allow MatSolveTranspose_SeqAIJCUSPARSE to use
   CUSPARSE_OPERATION_NON_TRANSPOSE triangular solves on transposed data
   rather than transpose-mode solves on the original factors.  Results are
   cached in cusparseTriFactors->{lo,up}TriFactorPtrTranspose.

   (Cleanup: loTriFactorT/upTriFactorT were previously initialized from the
   cached transpose pointers and immediately overwritten with `new`; those
   dead stores are gone and the locals are now plain declarations.)
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* built below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* built below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor: same type,
     index base and diagonal as the original, with the fill mode flipped
     because the transpose of a lower triangular matrix is upper triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transposed data is solved in non-transpose mode */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* cache the transposed lower factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor (fill mode
     flipped, as above) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transposed data is solved in non-transpose mode */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* cache the transposed upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
860 
/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - build and cache the transpose
   of A on the GPU in the currently selected storage format (CSR directly,
   or ELL/HYB via CSR intermediates), so that MatMultTranspose can be done
   as a non-transpose multiply on the cached data.
   The result is stored in cusparsestruct->matTranspose.

   Fix: the expanded row-offset array used as csr2csc input in the CSR
   branch is now freed after use (it was allocated with `new` and leaked).
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* the transpose structure being built */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the matrix-transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars, matching the
     CUSPARSE_POINTER_MODE_DEVICE set below */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; /* GPU --> CPU */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compressed rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU);
       rows absent from cprowIndices are empty, so they repeat the previous offset */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; /* CPU --> GPU */

    /* compute the transpose, i.e. the CSC form, of A */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* the expanded row offsets are only needed as csr2csc input; free them */
    delete rowoffsets_gpu;

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* NOTE(review): temp and tempT below both use rmap->n rows / cmap->n cols,
       and csr2hyb is called with (rmap->n, cmap->n) for the transposed data;
       this looks correct only for square matrices -- confirm */
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so its "compressed" row list is simply 0..cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* cache the transpose */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1006 
/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solve A^T x = b on the GPU for a
   factored matrix with non-identity row/column permutations.

   Steps: lazily build/analyze the transposed factors, gather b into x with
   the row permutation, solve with the transposed upper then lower factors
   (both stored so a NON_TRANSPOSE solve applies), gather into the work
   vector with the column permutation, and copy the result back to x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, the first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]].
     NOTE(review): the end iterator's element base is bGPU+n here but bGPU in
     MatSolve_SeqAIJCUSPARSE; the range length appears to come from the index
     iterators, so the two forms look equivalent -- confirm */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (transposed factor): tempGPU = U^-T x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L (transposed factor): x = L^-T tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and wait for the asynchronous GPU work to finish before timing ends */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1077 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solve A^T x = b on the
   GPU when the factorization used the natural (identity) ordering: an
   upper- then lower-triangular solve on the transposed factors, with no
   permutation steps.  The transposed factors are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, the first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (transposed factor): tempGPU = U^-T b */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L (transposed factor): x = L^-T tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore the vector arrays and wait for the asynchronous GPU work to finish before timing ends */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1129 
/*
   MatSolve_SeqAIJCUSPARSE - solve A x = b on the GPU for a factored matrix
   with non-identity row/column permutations: gather b with the row
   permutation into the work vector, solve L then U, and gather the result
   with the column permutation into x.  rpermIndices/cpermIndices were
   uploaded by the *AnalysisAndCopyToGPU routines.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: x = L^-1 tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: tempGPU = U^-1 x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  /* restore the vector arrays and wait for the asynchronous GPU work to finish before timing ends */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1188 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solve A x = b on the GPU when
   the factorization used the natural (identity) ordering: a forward solve
   with L into the work vector followed by a backward solve with U into x,
   with no permutation steps.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b_d);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward solve: work = L^-1 b */
  stat = cusparse_solve(factors->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b_d, work->data().get());CHKERRCUSPARSE(stat);

  /* backward solve: x = U^-1 work */
  stat = cusparse_solve(factors->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x_d);CHKERRCUSPARSE(stat);

  /* restore the vector arrays and wait for the asynchronous work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&b_d);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x_d);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1232 
/*
   Copies the AIJ matrix from the host to the GPU in the storage format selected in
   cusparsestruct->format (CSR, ELL, or HYB).  Rows with no nonzeros are squeezed out
   on the GPU (compressed-row storage), so the device matrix may have fewer rows than A;
   matstruct->cprowIndices maps compressed rows back to the full row numbering.

   Fast path: if the matrix is assembled, the nonzero pattern is unchanged and the
   format is CSR, only the numerical values are re-uploaded.  Otherwise the whole GPU
   structure is destroyed and rebuilt.

   NOTE(review): cusparsestruct->matTranspose is not invalidated here, so a transpose
   structure built before the values changed could be stale -- confirm that callers
   regenerate it when needed.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* same nonzero pattern as the structure already on the GPU: copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* nonzero pattern (or storage format) changed: rebuild the GPU structure from scratch */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);CHKERRQ(ierr); /* bug fix: return code was previously discarded */
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the matrix-multiply information (NOT triangular factors) */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants used as alpha/beta arguments of the spmv calls */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: stage a temporary CSR copy on the device, convert, then free the CSR copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        if (cusparsestruct->workVector) delete cusparsestruct->workVector; /* bug fix: old work vector was previously leaked on rebuild */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1370 
/* Thrust functor: given a tuple (src, dst), accumulates src into dst, i.e. dst += src.
   Used to scatter compressed-row SpMV results from the work vector back into the full
   output vector via a permutation iterator. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1380 
/* yy = A*xx on the GPU, implemented by calling MatMultAdd with a NULL addend vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1389 
/* yy = A^T * xx on the GPU.  A transposed copy of the matrix (matTranspose) is built
   lazily on first use; the multiply is then a NON_TRANSPOSE spmv on that copy. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose product: build the device-side transpose structure */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* yy = alpha*A^T*xx + beta_zero*yy, with alpha/beta stored on the device */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    /* ELL/HYB storage */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2 flops per stored nonzero, minus one per nonempty row (the first add is free) */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1440 
1441 
/* zz = A*xx + yy (or zz = A*xx when yy is NULL; MatMult_SeqAIJCUSPARSE calls it that
   way).  When the device matrix is stored compressed (zero rows squeezed out) the spmv
   result lands in a temporary work vector and is then scattered/added into zz via
   matstruct->cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when the device matrix spans all rows of A (no rows squeezed out),
       so the spmv can write straight into zz */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* spmv destination: zz directly when possible, otherwise the work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate (beta=1) only when the spmv output is both yy and zz at once */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result is in the work vector; seed zz with yy before the scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* result is already in zz (beta was zero); add yy on top */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult into compressed storage: zero zz, the scatter-add fills the rest */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1526 
/* zz = A^T*xx + yy on the GPU, using the lazily-built device transpose (matTranspose).
   Mirrors MatMultAdd_SeqAIJCUSPARSE: with compressed storage the spmv result goes into
   the work vector and is scattered/added into zz afterwards.
   NOTE(review): the up-front VecCopy of yy into zz appears redundant with the
   copy/axpy performed after the spmv -- confirm before removing.  Also, the scatter
   length below is A->cmap->n while MatMultAdd uses workVector->size(); confirm these
   agree for compressed transpose structures. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*dptr,*beta;
  PetscErrorCode                  ierr;
  cudaError_t                     cerr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose product: build the device-side transpose structure */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* spmv destination: zz directly when the work vector covers all columns of A,
       otherwise the work vector (compressed storage) */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate (beta=1) only when the spmv output is both yy and zz at once */
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      /* result is in the work vector; seed zz with yy before the scatter-add below */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      /* result is already in zz (beta was zero); add yy on top */
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1605 
/* Finishes host-side assembly and then eagerly uploads the data to the GPU,
   unless this is only a flush or the matrix is bound to the CPU. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* factored matrices keep their data in the triangular-factor structures instead */
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* re-install the GPU multiply operations; NOTE(review): presumably because
     MatAssemblyEnd_SeqAIJ may have replaced them -- confirm */
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1622 
1623 /* --------------------------------------------------------------------------------*/
1624 /*@
1625    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1627    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1628    assembly performance the user should preallocate the matrix storage by setting
1629    the parameter nz (or the array nnz).  By setting these parameters accurately,
1630    performance during matrix assembly can be increased by more than a factor of 50.
1631 
1632    Collective
1633 
1634    Input Parameters:
1635 +  comm - MPI communicator, set to PETSC_COMM_SELF
1636 .  m - number of rows
1637 .  n - number of columns
1638 .  nz - number of nonzeros per row (same for all rows)
1639 -  nnz - array containing the number of nonzeros in the various rows
1640          (possibly different for each row) or NULL
1641 
1642    Output Parameter:
1643 .  A - the matrix
1644 
1645    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1647    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1648 
1649    Notes:
1650    If nnz is given then nz is ignored
1651 
1652    The AIJ format (also called the Yale sparse matrix format or
1653    compressed row storage), is fully compatible with standard Fortran 77
1654    storage.  That is, the stored row and column indices can begin at
1655    either one (as in Fortran) or zero.  See the users' manual for details.
1656 
1657    Specify the preallocated storage with either nz or nnz (not both).
1658    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1659    allocation.  For large problems you MUST preallocate memory or you
1660    will get TERRIBLE performance, see the users' manual chapter on matrices.
1661 
1662    By default, this format uses inodes (identical nodes) when possible, to
1663    improve numerical efficiency of matrix-vector products and solves. We
1664    search for consecutive rows with the same nonzero structure, thereby
1665    reusing matrix information to achieve increased efficiency.
1666 
1667    Level: intermediate
1668 
1669 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1670 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation directly; the cast only drops const to match its signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1682 
/* Destroys the GPU-side data hanging off A->spptr (multiply structures for a plain
   matrix, triangular-factor structures for a factored one), then tears down the host
   AIJ matrix. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype!=MAT_FACTOR_NONE) {
    /* factored matrices always carry a triangular-factor structure */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
    /* plain matrices only hold GPU data once something was offloaded */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1698 
1699 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates a SeqAIJCUSPARSE matrix: duplicates the host AIJ data, then attaches a
   fresh (empty) GPU data structure and installs the CUSPARSE operations on the copy.
   No device data is copied here; the duplicate starts with offloadmask UNALLOCATED
   and is uploaded to the GPU on first use. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
    C->spptr            = spptr;
  } else {
    /* factored matrix: set the pointers to the (not yet computed) triangular factors;
       (dead store of handle=0 that was immediately overwritten has been removed) */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;
    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    spptr->handle                  = handle;
    spptr->nnz                     = 0;
    C->spptr                       = spptr;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  C->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->boundtocpu  = PETSC_FALSE;
  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1760 
1761 
/* Switches the matrix operations between the CPU (SeqAIJ) and GPU (CUSPARSE)
   implementations.
   Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask
   set to PETSC_OFFLOAD_GPU.  If this changes, we need to implement a routine to update
   the CPU (host) version of the matrix from the GPU one.  Right now, for safety we
   simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
   TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for
         AIJCUSPARSE matrices; can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  if (flg) {
    /* bug fix: multtranspose/multtransposeadd previously kept pointing at the CUSPARSE
       versions, so a matrix bound to the CPU would still attempt transpose products on
       GPU data that MatSeqAIJCUSPARSECopyToGPU (which early-returns when boundtocpu)
       never refreshes */
    A->ops->mult             = MatMult_SeqAIJ;
    A->ops->multadd          = MatMultAdd_SeqAIJ;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate        = MatDuplicate_SeqAIJ;
  } else {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
    A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
    A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1792 
/* Converts a SeqAIJ matrix in place to MATSEQAIJCUSPARSE: switches the default vector
   type to CUDA, attaches an empty GPU data structure (multiply structure for a plain
   matrix, triangular-factor structure for a factored one), and installs the CUSPARSE
   implementations of the matrix operations.  No data is uploaded here; the GPU copy
   happens lazily (offloadmask starts as PETSC_OFFLOAD_UNALLOCATED). */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  /* install the GPU implementations of the matrix operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1848 
/* Constructor for MATSEQAIJCUSPARSE: builds a plain SeqAIJ matrix and then converts
   it in place to the CUSPARSE type (attaches GPU data and CUSPARSE operations). */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1858 
1859 /*MC
1860    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1861 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1863    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1864    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1865 
1866    Options Database Keys:
1867 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1868 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1869 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1870 
1871   Level: beginner
1872 
1873 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1874 M*/
1875 
1876 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1877 
1878 
/*
   MatSolverTypeRegister_CUSPARSE - Registers the CUSPARSE solver package as a
   factorization provider for MATSEQAIJCUSPARSE matrices.

   The same factory routine, MatGetFactor_seqaijcusparse_cusparse(), serves all
   four supported factorization kinds (LU, Cholesky, ILU, ICC).
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factorTypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(factorTypes)/sizeof(factorTypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factorTypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1890 
1891 
/*
   MatSeqAIJCUSPARSE_Destroy - Frees all GPU-side resources held by a
   Mat_SeqAIJCUSPARSE struct and zeros the caller's pointer.

   Releases the forward and transpose mult structures, the work vector, and
   the cuSPARSE handle, then deletes the struct itself. Safe to call with an
   already-NULL *cusparsestruct (no-op).

   Fixes vs. the previous version: the PetscErrorCode returned by the two
   MatSeqAIJCUSPARSEMultStruct_Destroy() calls was silently dropped (every
   other call site in this file checks it with CHKERRQ), and the
   assignment-inside-if on the handle triggered compiler warnings.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;         /* deleting NULL is a no-op */
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1910 
/*
   CsrMatrix_Destroy - Releases the three device arrays (values, column
   indices, row offsets) of a CsrMatrix, deletes the struct, and zeros the
   caller's pointer. A NULL *mat is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1923 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (TriFactorStruct overload) - Frees one
   triangular-factor structure: its cuSPARSE matrix descriptor, its solve
   analysis info, and its CSR storage, then deletes the struct and zeros the
   caller's pointer. A NULL *trifactor is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *f = *trifactor;
    if (f->descr)     { stat = cusparseDestroyMatDescr(f->descr);CHKERRCUSPARSE(stat); }
    if (f->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(f->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&f->csrMat);CHKERRQ(ierr);
    delete f;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1939 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (MultStruct overload) - Frees a
   mat-mult structure whose on-device matrix may be stored either as a
   cusparseHybMat_t (ELL/HYB formats) or as a CsrMatrix (CSR format); the
   `format` argument selects which destructor applies. Also frees the matrix
   descriptor, the compressed-row index vector, and the three device-resident
   scalar constants (alpha, beta_zero, beta_one) allocated with cudaMalloc,
   then deletes the struct and zeros the caller's pointer.

   Fix vs. the previous version: the PetscErrorCode returned by
   CsrMatrix_Destroy() was silently dropped; it is now checked with CHKERRQ,
   matching the error-handling convention used throughout this file.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1967 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - Frees the complete triangular-factor
   bundle used by the cuSPARSE solvers: the four factor structures (lower and
   upper, plus their transposes), the row/column permutation index vectors,
   the work vector, and the cuSPARSE handle, then deletes the struct and
   zeros the caller's pointer. A NULL *trifactors is a no-op.

   Fixes vs. the previous version: the PetscErrorCode returned by the four
   MatSeqAIJCUSPARSEMultStruct_Destroy() calls was silently dropped (it is
   now checked with CHKERRQ, consistent with the rest of this file), and the
   assignment-inside-if on the handle triggered compiler warnings.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;           /* deleting NULL is a no-op */
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1990 
1991