xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 6a98f8dc3f2c9149905a87dc2e9d0fedaf64e09a)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Human-readable names for MatCUSPARSEStorageFormat, used by the options database
   (PetscOptionsEnum); the trailing entries follow the PETSc enum-string convention:
   enum type name, option prefix, NULL terminator. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic factorization entry points installed by
   MatGetFactor_seqaijcusparse_cusparse() */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular-solve and matrix-vector product implementations bound to the
   Mat ops table for MATSEQAIJCUSPARSE */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for the GPU-side data structures. Note the two
   MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads (this is a
   .cu translation unit): one for triangular-factor structs, one for mult structs. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
42 
/*
  MatCUSPARSESetStream - records the given CUDA stream in the matrix' GPU
  structure and attaches it to the matrix' cuSPARSE handle, so subsequent
  cuSPARSE calls on this matrix are issued on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
53 
/*
  MatCUSPARSESetHandle - makes the matrix use the caller-supplied cuSPARSE
  handle. Any previously held (different) handle is destroyed first, and the
  pointer mode of the adopted handle is set to device pointers.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the handle we previously owned before adopting the caller's */
    if (cusp->handle) {
      stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
    }
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
69 
/*
  MatCUSPARSEClearHandle - drops (without destroying) the matrix' reference to
  its cuSPARSE handle. Used when the handle is owned elsewhere (see
  MatCUSPARSESetHandle()) and must not be destroyed with this matrix.

  Robustness fix: guard against a NULL A->spptr, so the routine is a no-op on a
  matrix whose GPU structure was never allocated or has already been destroyed,
  instead of dereferencing a NULL pointer.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing to clear */
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
78 
/*
  MatFactorGetSolverType_seqaij_cusparse - query callback composed on factor
  matrices; reports that the solver package is MATSOLVERCUSPARSE. The Mat
  argument is required by the callback signature but not consulted.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
85 
86 /*MC
87   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
88   on a single GPU, of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
89   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
90   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
91   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
92   algorithms are not recommended. This class does NOT support direct solver operations.
93 
94   Level: beginner
95 
96 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
97 M*/
98 
/*
  MatGetFactor_seqaijcusparse_cusparse - factory routine registered for
  MATSOLVERCUSPARSE: creates the (unassembled) factor matrix B of type
  MATSEQAIJCUSPARSE and installs the symbolic factorization methods matching
  the requested factor type. Unsupported factor types raise PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
123 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
  MatCUSPARSESetFormat(). For sequential matrices MAT_CUSPARSE_MULT and
  MAT_CUSPARSE_ALL both simply set the single stored format; anything else is
  rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops map to the one format slot */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
141 
142 /*@
143    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144    operation. Only the MatMult operation can use different GPU storage formats
145    for MPIAIJCUSPARSE matrices.
146    Not Collective
147 
148    Input Parameters:
149 +  A - Matrix of type SEQAIJCUSPARSE
150 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
152 
153    Output Parameter:
154 
155    Level: intermediate
156 
157 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 @*/
/* Public entry point: validates the matrix header, then dispatches to the
   type-specific "MatCUSPARSESetFormat_C" method if the type provides one
   (PetscTryMethod is a silent no-op otherwise). */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* options
  controlling the GPU storage format. The SpMV-only key is offered only for
  unfactored matrices; the general key applies to both SpMV and TriSolve.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* a MULT-only format may differ from the format used for everything else */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&format,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&format,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
194 
/* ILU symbolic phase: delegate to the CPU SeqAIJ implementation, then point the
   numeric phase at the CUSPARSE variant so the factors end up on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
/* LU symbolic phase: delegate to the CPU SeqAIJ implementation, then point the
   numeric phase at the CUSPARSE variant so the factors end up on the GPU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
/* ICC symbolic phase: delegate to the CPU SeqAIJ implementation, then point the
   numeric phase at the CUSPARSE Cholesky variant. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
/* Cholesky symbolic phase: delegate to the CPU SeqAIJ implementation, then
   point the numeric phase at the CUSPARSE variant. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the lower-triangular ILU
  factor L (unit diagonal) from the host-side factored SeqAIJ data, assembles
  it into pinned host CSR arrays, uploads it into a new
  Mat_SeqAIJCUSPARSETriFactorStruct, and runs the cuSPARSE solve analysis so
  L is ready for device triangular solves. Rebuilds only when the valid copy
  of the data is on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;   /* pinned host CSR row offsets / column indices for L */
  PetscScalar                       *AALo;          /* pinned host CSR values for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);  /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal;
         row 0 contributes only its unit diagonal, hence ai[1] rather than ai[0] below
         (assumes the factored storage keeps the strictly-lower part of row i in ai[i]..ai[i+1) -- layout of
         MatILUFactorSymbolic_SeqAIJ output) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned memory for the later device upload) */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* ... then append the implicit unit diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, lower-triangular, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below copy the pinned host arrays to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? (loTriFactor was loaded from the same slot above) */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* host staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - extracts the upper-triangular ILU
  factor U from the host-side factored SeqAIJ data (whose U part is addressed
  through a->diag), assembles it into pinned host CSR arrays, uploads it into
  a new Mat_SeqAIJCUSPARSETriFactorStruct, and runs the cuSPARSE solve
  analysis. Diagonal entries are stored as reciprocals (1/v) so the device
  solve uses a non-unit diagonal. Rebuilds only when the valid copy of the
  data is on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices for U */
  PetscScalar                       *AAUp;          /* pinned host CSR values for U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);  /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
         adiag decreases with the row index in this storage, hence adiag[0]-adiag[n]. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned memory for the later device upload) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking the rows backwards while the
         write offset shrinks from nzUpper down to 0 */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        /* the strictly-upper entries of row i follow position adiag[i+1] in the factored storage */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored as reciprocal: the factored
           storage keeps 1/d, so 1./v[nz] recovers d -- see the matching solve) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then copy the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, upper-triangular, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below copy the pinned host arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? (upTriFactor was loaded from the same slot above) */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* host staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
440 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both ILU triangular
  factors on the GPU, allocates the solve work vector, and uploads the
  row/column permutations (only when they are not the identity, since
  identity permutations need no device-side gather/scatter).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only if non-trivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: upload the column permutation only if non-trivial */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic for however many permutations were uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
487 
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - builds both triangular factors of an
  ICC/Cholesky factorization on the GPU from the host-side factored data.
  Only the upper factor's CSR pattern is materialized; the "lower" solve is
  expressed as a TRANSPOSE solve on upper-triangular storage (see loTriFactor
  below), so the same AiUp/AjUp pattern is uploaded twice with two different
  value arrays (AAUp and AALo). Rebuilds only when the valid copy of the data
  is on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR pattern shared by both factors */
  PetscScalar                       *AAUp;          /* values of the upper factor */
  PetscScalar                       *AALo;          /* values of the (transposed-solve) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is Mat_SeqAIJ; the Mat_SeqSBAIJ cast below relies on
     the i/j/a members being layout-compatible between the two structs -- confirm
     against the struct definitions before touching this */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);  /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned memory for the later device upload) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; v[nz] is the stored diagonal of row i */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* copy the off-diagonal entries, negating for AAUp and additionally
             scaling by the diagonal for AALo */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, upper-triangular, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign()s below copy the pinned host arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? (upTriFactor was loaded from the same slot above) */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the "lower" factor keeps UPPER fill mode
         and a TRANSPOSE solve op -- the lower solve is done as U^T x = b on the
         same sparsity pattern, with non-unit diagonal values stored in AALo */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as the upper factor, different values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? (loTriFactor was loaded from the same slot above) */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* host staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
635 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC triangular factors
  on the GPU, allocates the solve work vector, and uploads the row permutation
  and its inverse (used as row/column permutations respectively) when the
  ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* device scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* a->nz counts the stored (upper) triangle incl. diagonal; the full symmetric
     factor has the off-diagonal part twice plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices: only a non-identity ordering needs device-side permutations */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* for the symmetric ordering the column permutation is the inverse of the row permutation */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
671 
/*
  Numeric LU factorization: runs the host CPU kernel, then selects the GPU
  triangular-solve variants (permutation-free kernels when both orderings are
  the identity) and stages the factors on the device.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the factorization itself is computed on the host */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering on both sides allows the permutation-free solve kernels */
  ierr = ISIdentity(baij->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(baij->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the L/U triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
700 
/*
  Numeric Cholesky/ICC factorization: runs the host CPU kernel, then selects
  the GPU triangular-solve variants (permutation-free kernels when the
  ordering is the identity) and stages the factors on the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the factorization itself is computed on the host */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* identity ordering permits the permutation-free solve kernels */
  ierr = ISIdentity(baij->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
729 
/*
  Builds, on the GPU, explicit transposes of the previously generated lower and
  upper triangular factors (via cusparse_csr2csc, so the CSC of each factor is
  stored and reinterpreted as CSR), and runs the cuSPARSE solve analysis on
  them.  Results are cached in cusparseTriFactors->{lo,up}TriFactorPtrTranspose
  and consumed by the MatSolveTranspose kernels.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  /* NOTE(review): the two initializers below are dead stores -- both pointers are
     unconditionally reassigned with `new` before ever being read */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     transposing flips the triangle, hence the inverted fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so the solve
     itself runs in non-transpose mode */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     again the fill mode is flipped by the transposition */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation: explicit transpose, so non-transpose solves */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
860 
/*
  Builds, on the GPU, an explicit transpose of the matrix for use by
  MatMultTranspose and caches it in cusparsestruct->matTranspose.
  For CSR storage a single csr2csc does the job; for ELL/HYB the pipeline is
  hyb2csr -> csr2csc -> csr2hyb with host-side temporaries freed at the end.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  /* NOTE(review): this initializer is a dead store -- matstructT is
     unconditionally reassigned with `new` below before being read */
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars (1, 0, 1) for the SpMV calls,
     matching the device pointer mode set below */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    /* the transpose has A's column count as its row count and vice versa */
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* upload the uncompressed host row offsets a->i for use as csr2csc input
       (NOTE(review): assigning PetscInt into a 32-bit array -- presumably fine
       because these are local row offsets; verify for 64-bit-index builds) */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is treated as having
     every one of its A->cmap->n rows nonzero (indices 0..n-1) */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
989 
/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  Solves A^T x = b using the explicitly transposed triangular factors
  (built lazily on first use): permute b, solve U^T, solve L^T, permute the
  result.  Used when the factorization ordering is not the identity.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]].
     NOTE(review): the transpose solve applies rperm on input and cperm on
     output, the same order as the forward solve; this is consistent for
     symmetric (ICC) orderings -- verify the permutation algebra for
     nonsymmetric row/column orderings from the ILU path. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: tempGPU = (U^T)^{-1} x (the factors are stored
     pre-transposed, so the solve op itself is non-transpose) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L: x = (L^T)^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1060 
/*
  Solves A^T x = b for factors stored in natural ordering: apply the
  (explicitly stored) transposed upper factor, then the transposed lower
  factor -- no permutations required.  The transposed factors are built
  lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  cusparseStatus_t                  cstat;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* lazily build and analyze the transposed factors */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* device pointers of the right-hand side and solution vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b_d);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* transposed upper factor first: work <- solve(upT, b) */
  cstat = cusparse_solve(factors->handle, upT->solveOp,
                         upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                         upT->csrMat->values->data().get(),
                         upT->csrMat->row_offsets->data().get(),
                         upT->csrMat->column_indices->data().get(),
                         upT->solveInfo,
                         b_d, work->data().get());CHKERRCUSPARSE(cstat);

  /* then the transposed lower factor: x <- solve(loT, work) */
  cstat = cusparse_solve(factors->handle, loT->solveOp,
                         loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                         loT->csrMat->values->data().get(),
                         loT->csrMat->row_offsets->data().get(),
                         loT->csrMat->column_indices->data().get(),
                         loT->solveInfo,
                         work->data().get(), x_d);CHKERRCUSPARSE(cstat);

  /* release the device arrays, sync, and log */
  ierr = VecCUDARestoreArrayRead(bb,&b_d);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x_d);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1112 
/*
  Solves A x = b with the GPU-resident triangular factors when the
  factorization uses a non-identity ordering: permute b with the row
  permutation, forward solve with L, back solve with U, then permute the
  result with the column permutation.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]].
     NOTE(review): the end iterator reuses the bGPU base (not bGPU+n); thrust
     permutation_iterator equality is driven by the index iterator, so this
     presumably terminates correctly -- the bGPU+n form used in
     MatSolveTranspose_SeqAIJCUSPARSE is the clearer spelling; verify. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: xarray = L^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: tempGPU = U^{-1} xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1171 
/*
  Solves A x = b when the factors are stored in natural ordering:
  forward substitution with L, then back substitution with U -- no
  permutations required.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  cusparseStatus_t                  cstat;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* device pointers of the right-hand side and solution vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b_d);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward substitution: work <- solve(L, b) */
  cstat = cusparse_solve(factors->handle, lo->solveOp,
                         lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                         lo->csrMat->values->data().get(),
                         lo->csrMat->row_offsets->data().get(),
                         lo->csrMat->column_indices->data().get(),
                         lo->solveInfo,
                         b_d, work->data().get());CHKERRCUSPARSE(cstat);

  /* back substitution: x <- solve(U, work) */
  cstat = cusparse_solve(factors->handle, up->solveOp,
                         up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                         up->csrMat->values->data().get(),
                         up->csrMat->row_offsets->data().get(),
                         up->csrMat->column_indices->data().get(),
                         up->solveInfo,
                         work->data().get(), x_d);CHKERRCUSPARSE(cstat);

  /* release the device arrays, sync, and log */
  ierr = VecCUDARestoreArrayRead(bb,&b_d);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x_d);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1215 
/*
   Mirrors the host (CPU) AIJ data onto the GPU if the host copy is newer.

   Two paths:
     1. Fast path: the matrix was assembled before with the same nonzero pattern
        (nonzerostate unchanged) and is stored as CSR -> only the numerical values
        are copied; a cached transpose, if present, is refreshed with csr2csc.
     2. Full rebuild: the GPU structures are destroyed and recreated from scratch.
        Rows containing no nonzeros are always squeezed out on the GPU
        ("compressed row" storage), even when the CPU side does not compress.

   No-op when the matrix is bound to the CPU or the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Same nonzero pattern as last time: copy values only, reuse the GPU index arrays */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before; only the numeric values need to be
         transposed since the sparsity pattern has not changed */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* Pattern changed (or first assembly): tear everything down and rebuild */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* count rows that contain at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* CPU already stores compressed rows: reuse its index arrays directly */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets (ii) and the map from
             compressed row -> original row (ridx) for the nonempty rows only */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow; /* from here on, m counts compressed rows */
        }

        /* allocate the GPU multiply structure and its matrix descriptor */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (pointer mode below is DEVICE) */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix: upload row offsets, column indices, and values to the GPU */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: first stage the data as a temporary CSR matrix on the GPU ... */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* ... then convert CSR -> HYB (ELL uses the MAX partition, HYB uses AUTO) */
          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the staging CSR matrix is no longer needed once the HYB copy exists */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices (compressed row -> original row map) */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only in the not-already-compressed case */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector of compressed length, used by MatMultAdd for the scatter-add */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1372 
/* Thrust functor for zip iterators: accumulates the first tuple element into the
   second (dst += src). Used to scatter-add a compressed work vector into a full
   result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1382 
/* y = A*x, implemented by the fused multiply-add kernel with no vector to add (yy = NULL) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  status = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(status);
  PetscFunctionReturn(0);
}
1391 
/*
   y = A^T * x on the GPU.

   An explicit copy of the transpose (cusparsestruct->matTranspose) is built lazily on
   first use and then multiplied with NON_TRANSPOSE, which is faster than cusparse's
   transposed SpMV. Note the transpose copy uses non-compressed row storage.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the explicit transpose and cache it */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* y = alpha*A_T*x + beta_zero*y, with device-resident alpha=1, beta=0 */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1442 
1443 
/*
   z = y + A*x on the GPU (y may be NULL, giving z = A*x; see MatMult_SeqAIJCUSPARSE).

   The GPU matrix always stores compressed rows (empty rows squeezed out). When the
   matrix actually has empty rows (cmpr == PETSC_FALSE) the SpMV result is produced
   in a shorter work vector and then scattered (+=) into z through the compressed-row
   index map (cprowIndices); otherwise the SpMV writes directly into z.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when the work vector has full length, i.e. no rows were squeezed out */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* dptr: where the SpMV result goes — directly into z, or into the work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* beta = 1 only when the SpMV can accumulate in place (z aliases y and is the SpMV target) */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* seed z before the scatter-add: z starts from y (or zero when there is no y) */
    if (yy) {
      if (dptr != zarray) {
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1528 
/*
   z = y + A^T * x on the GPU, using the cached explicit transpose (built lazily on
   first use, same as MatMultTranspose_SeqAIJCUSPARSE).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*beta;
  PetscErrorCode                  ierr;
  cudaError_t                     cerr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  /* Note unlike Mat, MatTranspose uses non-compressed row storage */
  try {
    /* start from z = y, then either accumulate in place (beta_one when z aliases y)
       or overwrite and add y back afterwards */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               zarray);CHKERRCUSPARSE(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 zarray);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* when z and y are distinct, the SpMV overwrote z (beta_zero); add y back in */
    if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);}
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1587 
/* Finishes assembly: runs the host AIJ assembly, then for a final assembly of a matrix
   that is not bound to the CPU, mirrors the data to the GPU (unfactored matrices only)
   and (re)installs the CUSPARSE multiply operations. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu) {
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1604 
1605 /* --------------------------------------------------------------------------------*/
1606 /*@
1607    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1609    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1610    assembly performance the user should preallocate the matrix storage by setting
1611    the parameter nz (or the array nnz).  By setting these parameters accurately,
1612    performance during matrix assembly can be increased by more than a factor of 50.
1613 
1614    Collective
1615 
1616    Input Parameters:
1617 +  comm - MPI communicator, set to PETSC_COMM_SELF
1618 .  m - number of rows
1619 .  n - number of columns
1620 .  nz - number of nonzeros per row (same for all rows)
1621 -  nnz - array containing the number of nonzeros in the various rows
1622          (possibly different for each row) or NULL
1623 
1624    Output Parameter:
1625 .  A - the matrix
1626 
1627    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1629    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1630 
1631    Notes:
1632    If nnz is given then nz is ignored
1633 
1634    The AIJ format (also called the Yale sparse matrix format or
1635    compressed row storage), is fully compatible with standard Fortran 77
1636    storage.  That is, the stored row and column indices can begin at
1637    either one (as in Fortran) or zero.  See the users' manual for details.
1638 
1639    Specify the preallocated storage with either nz or nnz (not both).
1640    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1641    allocation.  For large problems you MUST preallocate memory or you
1642    will get TERRIBLE performance, see the users' manual chapter on matrices.
1643 
1644    By default, this format uses inodes (identical nodes) when possible, to
1645    improve numerical efficiency of matrix-vector products and solves. We
1646    search for consecutive rows with the same nonzero structure, thereby
1647    reusing matrix information to achieve increased efficiency.
1648 
1649    Level: intermediate
1650 
1651 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1652 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* Use the public preallocation interface instead of reaching into the private
     _SeqAIJ implementation; it accepts a const nnz[] directly, so no const-stripping
     cast is needed, and it dispatches to the type's registered preallocation routine */
  ierr = MatSeqAIJSetPreallocation(*A,nz,nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1664 
/* Frees the GPU-side data (mult structures for an unfactored matrix, triangular factor
   structures otherwise), then hands off to the host AIJ destroy routine. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* remove the method composed in MatConvert_SeqAIJ_SeqAIJCUSPARSE()/MatDuplicate_SeqAIJCUSPARSE()
     so an in-place conversion away from this type does not leave a stale entry behind */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1680 
1681 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates A (host data handled by MatDuplicate_SeqAIJ per cpvalues), then attaches
   fresh, empty CUSPARSE structures to the copy; the GPU data of the copy is populated
   lazily on first use (offloadmask starts UNALLOCATED). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat            = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector     = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->rowoffsets_gpu = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format         = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle         = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate   = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    /* removed a dead store here (handle was zeroed and then immediately overwritten);
       this now matches MatConvert_SeqAIJ_SeqAIJCUSPARSE() */
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  C->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->boundtocpu  = PETSC_FALSE;
  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1743 
1744 
/* Switches the matrix operations between the host AIJ kernels (flg == PETSC_TRUE)
   and the CUSPARSE GPU kernels (flg == PETSC_FALSE). */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscFunctionBegin;
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  if (flg) {
    /* bind to CPU: route the core operations through the host AIJ kernels */
    A->ops->mult             = MatMult_SeqAIJ;
    A->ops->multadd          = MatMultAdd_SeqAIJ;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate        = MatDuplicate_SeqAIJ;
  } else {
    /* unbind: restore the CUSPARSE (GPU) implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
    A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
    A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1773 
/* Converts a SEQAIJ matrix in place into a SEQAIJCUSPARSE matrix: installs the CUDA
   default vector type, attaches empty GPU data structures (populated lazily on first
   use), and wires up the CUSPARSE implementations of the matrix operations. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;

    spptr->mat            = 0;
    spptr->matTranspose   = 0;
    spptr->workVector     = 0;
    spptr->rowoffsets_gpu = 0;
    spptr->format         = MAT_CUSPARSE_CSR;
    spptr->stream         = 0;
    spptr->nonzerostate   = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    spptr->handle         = handle;
    B->spptr = spptr;
  } else {
    /* factored matrix: set the pointers to the (not yet computed) triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    tri->nnz                     = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    tri->handle                  = handle;
    B->spptr = tri;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1830 
/* Type constructor: build the host SEQAIJ representation first, then graft the
   CUSPARSE pieces onto it via the in-place convert routine. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatCreate_SeqAIJ(B);CHKERRQ(err);
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
1840 
1841 /*MC
1842    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1843 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1845    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1846    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1847 
1848    Options Database Keys:
1849 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1850 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1851 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1852 
1853   Level: beginner
1854 
1855 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1856 M*/
1857 
1858 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1859 
1860 
1861 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1862 {
1863   PetscErrorCode ierr;
1864 
1865   PetscFunctionBegin;
1866   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1867   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1868   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1869   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1870   PetscFunctionReturn(0);
1871 }
1872 
1873 
1874 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1875 {
1876   cusparseStatus_t stat;
1877   cusparseHandle_t handle;
1878 
1879   PetscFunctionBegin;
1880   if (*cusparsestruct) {
1881     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1882     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1883     delete (*cusparsestruct)->workVector;
1884     delete (*cusparsestruct)->rowoffsets_gpu;
1885     if (handle = (*cusparsestruct)->handle) {
1886       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
1887     }
1888     delete *cusparsestruct;
1889     *cusparsestruct = 0;
1890   }
1891   PetscFunctionReturn(0);
1892 }
1893 
1894 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1895 {
1896   PetscFunctionBegin;
1897   if (*mat) {
1898     delete (*mat)->values;
1899     delete (*mat)->column_indices;
1900     delete (*mat)->row_offsets;
1901     delete *mat;
1902     *mat = 0;
1903   }
1904   PetscFunctionReturn(0);
1905 }
1906 
1907 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1908 {
1909   cusparseStatus_t stat;
1910   PetscErrorCode   ierr;
1911 
1912   PetscFunctionBegin;
1913   if (*trifactor) {
1914     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
1915     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
1916     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
1917     delete *trifactor;
1918     *trifactor = 0;
1919   }
1920   PetscFunctionReturn(0);
1921 }
1922 
1923 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1924 {
1925   CsrMatrix        *mat;
1926   cusparseStatus_t stat;
1927   cudaError_t      err;
1928 
1929   PetscFunctionBegin;
1930   if (*matstruct) {
1931     if ((*matstruct)->mat) {
1932       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1933         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1934         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
1935       } else {
1936         mat = (CsrMatrix*)(*matstruct)->mat;
1937         CsrMatrix_Destroy(&mat);
1938       }
1939     }
1940     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
1941     delete (*matstruct)->cprowIndices;
1942     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1943     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1944     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1945     delete *matstruct;
1946     *matstruct = 0;
1947   }
1948   PetscFunctionReturn(0);
1949 }
1950 
1951 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1952 {
1953   cusparseHandle_t handle;
1954   cusparseStatus_t stat;
1955 
1956   PetscFunctionBegin;
1957   if (*trifactors) {
1958     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1959     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1960     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1961     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1962     delete (*trifactors)->rpermIndices;
1963     delete (*trifactors)->cpermIndices;
1964     delete (*trifactors)->workVector;
1965     if (handle = (*trifactors)->handle) {
1966       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
1967     }
1968     delete *trifactors;
1969     *trifactors = 0;
1970   }
1971   PetscFunctionReturn(0);
1972 }
1973 
1974