xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision f253e43cc674c8507d337bbb5ef3ec57f9e3fddc)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Option-value names for MatCUSPARSEStorageFormat, in enum order, followed by the
   enum type name, the option prefix, and a terminating null as required by the
   PetscOptionsEnum()/PetscEnum convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 
37 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
38 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
39 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
40 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
41 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
42 
/*
   MatCUSPARSESetStream - Records a CUDA stream in the matrix's GPU data structure and
   attaches it to the matrix's cusparse handle, so subsequent cusparse operations on
   this matrix are queued on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  stream - the CUDA stream (not owned by the matrix; the caller keeps responsibility for it)

   NOTE(review): A->spptr is dereferenced without a NULL check -- confirm callers only
   invoke this after the Mat_SeqAIJCUSPARSE structure has been created.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
53 
/*
   MatCUSPARSESetHandle - Replaces the matrix's cusparse handle with a caller-supplied
   one, destroying the previously owned handle if it differs, and switches the handle
   to device pointer mode.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  handle - the cusparse handle to install (ownership transfers to the matrix; a prior
            handle, if different, is destroyed here)

   NOTE(review): the pointer mode is set to CUSPARSE_POINTER_MODE_DEVICE even when the
   handle is unchanged -- presumably intentional so the mode is always enforced; confirm.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle this matrix previously owned before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
69 
/*
   MatCUSPARSEClearHandle - Drops the matrix's reference to its cusparse handle without
   destroying it (used when the handle is owned elsewhere, e.g. after
   MatCUSPARSESetHandle()).

   Input Parameter:
.  A - matrix of type SEQAIJCUSPARSE

   A->spptr may be NULL when the GPU data structure was never created or has already
   been torn down; guard against dereferencing it in that case.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
78 
/* Query callback composed on factor matrices: reports that the factor was produced by
   the MATSOLVERCUSPARSE solver package. The input matrix A is unused. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
85 
86 /*MC
87   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
88   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
90   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
91   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
92   algorithms are not recommended. This class does NOT support direct solver operations.
93 
94   Level: beginner
95 
96 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
97 M*/
98 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix B that will
   hold an LU/ILU or Cholesky/ICC factorization of A computed through the CUSPARSE
   solver package.

   The factor is square of size A->rmap->n. Only the symbolic-factorization function
   pointers are installed here; the matching numeric routines are hooked up by those
   symbolic routines.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* NOTE(review): block sizes are propagated only for the LU family and not for the
       Cholesky/ICC branch below -- confirm this asymmetry is intended. */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the factor's storage is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
123 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat() for sequential matrices. A SEQAIJCUSPARSE matrix keeps a
   single storage format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL act identically;
   any other operation is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations map to the single stored format */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
141 
142 /*@
143    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144    operation. Only the MatMult operation can use different GPU storage formats
145    for MPIAIJCUSPARSE matrices.
146    Not Collective
147 
148    Input Parameters:
149 +  A - Matrix of type SEQAIJCUSPARSE
150 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
152 
153    Output Parameter:
154 
155    Level: intermediate
156 
157 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently does nothing for matrix types that do not compose
     "MatCUSPARSESetFormat_C", so this is safe to call on any Mat. */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* options controlling
   the GPU storage format used for SpMV (and, for factors, TriSolve).

   For a factored matrix A->spptr holds a Mat_SeqAIJCUSPARSETriFactors, NOT a
   Mat_SeqAIJCUSPARSE, so cusparsestruct->format must only be read when
   A->factortype == MAT_FACTOR_NONE; previously it was dereferenced unconditionally
   for the -mat_cusparse_storage_format option, reading through a wrong-typed pointer.
   For factors the displayed current value now falls back to MAT_CUSPARSE_CSR.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* only dereference cusparsestruct when spptr really is a Mat_SeqAIJCUSPARSE */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,
                          (PetscEnum)(A->factortype == MAT_FACTOR_NONE ? cusparsestruct->format : MAT_CUSPARSE_CSR),
                          (PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
194 
/* Symbolic ILU: delegate entirely to the CPU AIJ implementation, then redirect the
   numeric phase to the CUSPARSE version so the factors end up on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
/* Symbolic LU: delegate to the CPU AIJ implementation, then install the CUSPARSE
   numeric routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
/* Symbolic ICC: delegate to the CPU AIJ implementation, then install the CUSPARSE
   Cholesky numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
/* Symbolic Cholesky: delegate to the CPU AIJ implementation, then install the
   CUSPARSE Cholesky numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Assembles the unit lower triangular factor
   L of an ILU/LU factorization in host (pinned) CSR arrays, uploads it to the GPU, and
   runs the cusparse triangular-solve analysis on it.

   The factored Mat_SeqAIJ stores the strictly lower part of row i in
   aj/aa[ai[i]..ai[i+1]); the unit diagonal is implicit on the host and is materialized
   here as explicit 1.0 entries (the GPU descriptor is also marked
   CUSPARSE_DIAG_TYPE_UNIT).

   Only runs when the data is newer on the CPU (offloadmask UNALLOCATED or CPU).

   NOTE(review): on a numeric re-factorization (offloadmask == PETSC_OFFLOAD_CPU) a
   previously built loTriFactorPtr appears to be overwritten without being destroyed
   here -- confirm the destroy path covers this, otherwise it leaks GPU memory.
   NOTE(review): catch(char*) does not catch the std::exception types thrust/CUDA
   normally throw -- verify against the project's error-handling convention.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] = strictly-lower entries of rows 1..n-1; row 0 contributes only its unit diagonal */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the thrust assign() copies below run at full bandwidth */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* strictly-lower entries of row i, followed by the explicit unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* host -> device copies of the three CSR arrays */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Assembles the upper triangular factor U of
   an ILU/LU factorization in host (pinned) CSR arrays, uploads it to the GPU, and runs
   the cusparse triangular-solve analysis on it.

   The factored Mat_SeqAIJ stores U rows via a->diag: for row i the entries strictly
   above the diagonal live at aj/aa[adiag[i+1]+1 .. adiag[i]), and aa[adiag[i]+...+nz]
   holds the pivot whose reciprocal becomes the explicit diagonal here (descriptor is
   CUSPARSE_DIAG_TYPE_NON_UNIT). Rows are filled back-to-front because adiag decreases
   with i.

   Only runs when the data is newer on the CPU (offloadmask UNALLOCATED or CPU).

   NOTE(review): as with the lower factor, a pre-existing upTriFactorPtr is overwritten
   without an explicit destroy here -- confirm no leak on re-factorization.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs downward, so adiag[0]-adiag[n] spans all of U including diagonals */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* diagonal stores 1/pivot so the GPU solve multiplies instead of divides */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* host -> device copies of the three CSR arrays */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
440 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds both ILU triangular factors on the
   GPU (including the cusparse solve analysis), allocates the intermediate work vector,
   and uploads the row/column permutations when they are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,permbytes = 0;

  PetscFunctionBegin;
  /* assemble L and U on the device and run the solve analysis for each */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* ship the row permutation to the GPU unless it is the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    permbytes += n*sizeof(PetscInt);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* likewise for the column permutation */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    permbytes += n*sizeof(PetscInt);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);

  /* account for however many permutation arrays actually crossed the bus */
  if (permbytes) {
    ierr = PetscLogCpuToGpu(permbytes);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
487 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - Assembles both triangular factors of an
   ICC/Cholesky factorization on the GPU from a single upper-triangular host pass.

   Only the upper factor's CSR structure is built on the host (AiUp/AjUp); the "lower"
   solve reuses the same structure with CUSPARSE_OPERATION_TRANSPOSE and a separate
   value array (AALo), so U^T serves as L without forming it explicitly.

   NOTE(review): A->data is cast to both Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ (for
   i/j/a) -- this relies on the two structs laying out those members compatibly for a
   factored matrix; confirm against the struct definitions.
   NOTE(review): as with the ILU builders, pre-existing up/loTriFactorPtr values are
   overwritten without an explicit destroy -- confirm no leak on re-factorization.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the pivot for row i; both factors store its reciprocal on the diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonal values are negated for U; for L they are additionally scaled by 1/pivot */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* same upper CSR structure, but solved transposed with non-unit diagonal: acts as L */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
635 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC triangular factors on the
   GPU, allocates the intermediate work vector, and uploads the permutation and its
   inverse when the ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  const PetscInt               *rip;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* scratch vector for the two triangular solves; nnz counts the strictly
     triangular part twice (U and U^T) plus the diagonal once */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* non-identity ordering: ship both the permutation and its inverse to the GPU */
    const PetscInt *irip;
    IS             iip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
671 
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric LU factorization is done by the CPU AIJ kernel */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* identity row AND column orderings allow the permutation-free solve kernels */
  ierr = ISIdentity(fact->row,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the cuSPARSE solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
700 
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric Cholesky factorization is done by the CPU AIJ kernel */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* an identity row permutation allows the permutation-free solve kernels */
  ierr = ISIdentity(fact->row,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the cuSPARSE solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
729 
/*
   Build explicit transposes (CSC form) of the cached lower and upper triangular
   factors and run the cuSPARSE solve analysis on them, so MatSolveTranspose can
   be performed with NON_TRANSPOSE triangular solves.  Results are cached in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose; the visible callers check
   those pointers before calling, so this runs at most once per factorization.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below; do not read the (unset) cached pointer */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the transpose
     of a triangular matrix has the opposite fill mode, so flip it */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so solves use NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* cache the result so subsequent transpose solves can reuse it */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor (fill mode flipped) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so solves use NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* cache the result so subsequent transpose solves can reuse it */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
860 
/*
   Build an explicit transpose of the GPU matrix stored in A->spptr (CSR, or
   ELL/HYB via a CSR round-trip) and cache it in cusparsestruct->matTranspose,
   presumably so MatMultTranspose can run as a non-transposed multiply.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* allocated below; do not read the (unset) cached pointer */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the multiply information of the transpose */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars for the multiply routines */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compressed rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU):
       rows absent from cprowIndices are empty and keep the previous offset */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* compute the transpose, i.e. the CSC, of the matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* only needed as csr2csc input; was previously leaked (thrust's destructor
       cudaFree synchronizes, so freeing after the call is safe) */
    delete rowoffsets_gpu;

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT keeps rmap->n rows / cmap->n cols even though it holds
       the transpose; harmless only for square matrices — verify for rectangular */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the transpose uses a full (identity) compressed-row index set */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* cache the transpose in the cusparse structure */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1006 
1007 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solve A^T x = b on the GPU with row/column permutations applied.
   The transposed factors are stored explicitly in CSC form (built lazily by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve), so both solves below use the
   NON_TRANSPOSE operation.  Solve order is U^T first, then L^T.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, first time only */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, gathering into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: x -> tempGPU (non-transpose solve of the stored U transpose) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and account for the GPU work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1077 
/*
   Solve A^T x = b on the GPU for a factorization in natural (identity)
   ordering: no permutations are applied.  Uses the explicitly transposed
   factors (CSC form, built lazily below), solving U^T then L^T with
   NON_TRANSPOSE operations.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, first time only */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore the vector arrays and account for the GPU work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1129 
/*
   Solve A x = b on the GPU with row/column permutations applied:
   gather b through rpermIndices, solve L then U, then scatter through
   cpermIndices into x.  The work vector ping-pongs with x between solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b into tempGPU.
     Note: iteration of a permutation_iterator is driven by the index iterator,
     so the same base (bGPU) in begin and end still spans n elements. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: gather tempGPU into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  /* restore the vector arrays and account for the GPU work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1188 
/*
   Solve A x = b on the GPU for a factorization in natural (identity)
   ordering: no permutations are applied.  Solve L (b -> work vector)
   then U (work vector -> x).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore the vector arrays and account for the GPU work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1232 
/*
   Copies (or re-copies) a SeqAIJ matrix from host memory into a cusparse
   structure on the GPU.  A no-op unless the host copy is newer than the GPU
   copy (offloadmask is UNALLOCATED or CPU).

   Fast path: if the matrix is assembled, the nonzero pattern is unchanged
   since the last copy (same nonzerostate) and the CSR format is in use, only
   the numerical values are re-uploaded.  Otherwise the whole GPU structure is
   destroyed and rebuilt.

   Rows with no nonzeros are always squeezed out on the GPU ("compressed row"
   storage): cprowIndices maps each compressed row back to its global row.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only; the sparsity pattern on the GPU is still valid */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* pattern changed (or first copy): tear down and rebuild the GPU structure.
         NOTE(review): the PetscErrorCode returned by the destroy routine is discarded here */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count the rows that contain at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* the host matrix already stores compressed-row indices; reuse them */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          /* build row offsets (ii) and row index map (ridx) over nonempty rows only */
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (alpha=1, beta=0, beta=1) so spmv can run
           with CUSPARSE_POINTER_MODE_DEVICE, set below */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: build a temporary CSR matrix on the GPU, convert it with
             cusparse_csr2hyb, then free the temporary */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL is HYB with a forced maximal ELL width; plain HYB lets cusparse choose */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only in the "forcing compressed row" branch */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector of one entry per (compressed) row, used by MatMultAdd et al. */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1369 
/* Functor for thrust::for_each over a zip iterator: adds the first tuple
   element into the second, i.e. get<1>(t) += get<0>(t).  Used to scatter-add
   a compressed work vector back into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1379 
/* yy = A*xx.  Implemented by delegating to MatMultAdd with no vector to add
   (the NULL argument), which handles compressed-row storage internally. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1388 
/*
   yy = A^T * xx.  The product is computed with a separately stored explicit
   transpose of the matrix (matTranspose, built lazily on first use), so
   cusparse is always invoked with CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the explicit transpose on the GPU */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* y = alpha*A_T*x + beta_zero*y, with alpha/beta stored on the device */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* one multiply and one add per nonzero, less the first add of each nonempty row */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1439 
1440 
/*
   zz = yy + A*xx (zz = A*xx when yy is NULL; MatMult_SeqAIJCUSPARSE calls this
   with yy == NULL).  When rows have been squeezed out of the GPU copy
   (compressed-row storage), the spmv writes into a work vector of one entry
   per stored row, which is then scatter-added into zz via cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when the work vector spans every row of A, i.e. no rows were
       squeezed out, so the spmv result can land directly in zz */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      /* NOTE(review): when yy && cmpr && yy == zz the beta_one branch below makes
         cusparse READ zarray although it was fetched with GetArrayWrite -- confirm
         the GPU copy of zz is guaranteed current on this path */
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* destination of the spmv: zz itself, or the compressed work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one accumulates (z = A x + z) only when writing straight into zz == yy */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* compressed path: seed zz with yy, then scatter-add the product below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* uncompressed path with distinct output: zz currently holds A*xx, add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult on compressed storage: squeezed-out rows of zz must be zero */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1525 
/*
   zz = yy + A^T * xx, using the explicitly stored transpose (built lazily, as
   in MatMultTranspose_SeqAIJCUSPARSE).  Mirrors the structure of
   MatMultAdd_SeqAIJCUSPARSE: the spmv targets either zz directly or a
   compressed work vector that is scatter-added into zz afterwards.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*dptr,*beta;
  PetscErrorCode                  ierr;
  cudaError_t                     cerr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* NOTE(review): this initial copy looks redundant -- every path below either
       overwrites zz (beta_zero / second VecCopy) or accumulates via beta_one or
       VecAXPY; confirm whether it is only needed to make zz's GPU array current */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* write directly into zz when the work vector spans all columns of A
       (i.e. all rows of A^T); otherwise use the compressed work vector */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      /* compressed path: seed zz with yy, then scatter-add the product below */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation.
         NOTE(review): the iteration length here is A->cmap->n while the analogous
         scatter in MatMultAdd_SeqAIJCUSPARSE uses workVector->size(); on this
         branch workVector->size() != A->cmap->n, so confirm the intended length */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1604 
/* Completes assembly on the host and, for a final (non-flush) assembly of an
   unfactored matrix, pushes the data to the GPU and (re)installs the
   GPU-backed multiply operations. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1621 
1622 /* --------------------------------------------------------------------------------*/
1623 /*@
1624    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1626    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1627    assembly performance the user should preallocate the matrix storage by setting
1628    the parameter nz (or the array nnz).  By setting these parameters accurately,
1629    performance during matrix assembly can be increased by more than a factor of 50.
1630 
1631    Collective
1632 
1633    Input Parameters:
1634 +  comm - MPI communicator, set to PETSC_COMM_SELF
1635 .  m - number of rows
1636 .  n - number of columns
1637 .  nz - number of nonzeros per row (same for all rows)
1638 -  nnz - array containing the number of nonzeros in the various rows
1639          (possibly different for each row) or NULL
1640 
1641    Output Parameter:
1642 .  A - the matrix
1643 
1644    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1646    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1647 
1648    Notes:
1649    If nnz is given then nz is ignored
1650 
1651    The AIJ format (also called the Yale sparse matrix format or
1652    compressed row storage), is fully compatible with standard Fortran 77
1653    storage.  That is, the stored row and column indices can begin at
1654    either one (as in Fortran) or zero.  See the users' manual for details.
1655 
1656    Specify the preallocated storage with either nz or nnz (not both).
1657    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1658    allocation.  For large problems you MUST preallocate memory or you
1659    will get TERRIBLE performance, see the users' manual chapter on matrices.
1660 
1661    By default, this format uses inodes (identical nodes) when possible, to
1662    improve numerical efficiency of matrix-vector products and solves. We
1663    search for consecutive rows with the same nonzero structure, thereby
1664    reusing matrix information to achieve increased efficiency.
1665 
1666    Level: intermediate
1667 
1668 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1669 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* sequential matrix: local and global sizes coincide */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate the host (SeqAIJ) storage; the GPU copy is built at assembly */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1681 
/* Destroys the GPU-side data hanging off spptr (multiply structures for an
   unfactored matrix, triangular-factor structures for a factored one), then
   hands off to the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
    /* only tear down the GPU copy if one was ever allocated */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1697 
/*
   Duplicates a SEQAIJCUSPARSE matrix: the host (SeqAIJ) data is duplicated and
   a fresh, empty GPU data structure is attached to the copy; the GPU copy
   itself is (re)built lazily by MatSeqAIJCUSPARSECopyToGPU().

   Fix: the factored branch previously contained a dead store
   "->handle = 0" immediately overwritten by the cusparseCreate() result,
   and inconsistent with MatConvert_SeqAIJ_SeqAIJCUSPARSE; it has been removed.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  /* GPU matrices pair naturally with CUDA vectors */
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    C->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet */
  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1756 
/*
   Converts a SeqAIJ matrix to SEQAIJCUSPARSE in place: attaches an empty GPU
   data structure (multiply structures for an unfactored matrix, triangular
   factor structures otherwise), installs the GPU-backed operations and
   switches the type name.  The GPU copy itself is populated lazily at the
   first MatSeqAIJCUSPARSECopyToGPU().
*/
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  /* GPU matrices pair naturally with CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    B->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* set the pointers to the (not yet computed) triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet */
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1810 
/*
   Registered constructor for MATSEQAIJCUSPARSE: builds a plain SeqAIJ matrix,
   then converts it in place to the CUSPARSE type.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1820 
1821 /*MC
1822    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1823 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1825    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1826    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1827 
1828    Options Database Keys:
1829 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1830 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1831 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1832 
1833   Level: beginner
1834 
1835 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1836 M*/
1837 
1838 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1839 
1840 
/* Registers the CUSPARSE solver package as a provider of LU, Cholesky, ILU and
   ICC factorizations for SEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1852 
1853 
/*
   MatSeqAIJCUSPARSE_Destroy - frees the GPU-side data held by a
   Mat_SeqAIJCUSPARSE structure (multiply structs for A and A^T, the work
   vector, and the cusparse handle) and zeros the caller's pointer.
   Safe to call when *cusparsestruct is already NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* these destroy routines return PetscErrorCode; check it (was silently discarded) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    if ((handle = (*cusparsestruct)->handle)) { /* extra parens: assignment is intentional */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0; /* prevent a dangling pointer / double free in the caller */
  }
  PetscFunctionReturn(0);
}
1872 
/*
   CsrMatrix_Destroy - releases the three device containers of a CsrMatrix
   (values, column indices, row offsets), deletes the struct itself, and
   zeros the caller's pointer. A NULL *mat is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0; /* leave the caller with a clean NULL pointer */
  }
  PetscFunctionReturn(0);
}
1885 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - frees
   one Mat_SeqAIJCUSPARSETriFactorStruct: its cusparse matrix descriptor,
   solve-analysis info, and CSR storage, then the struct itself, zeroing the
   caller's pointer. A NULL *trifactor is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1901 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (multiply-struct overload) - frees one
   Mat_SeqAIJCUSPARSEMultStruct. The opaque mat member is interpreted
   according to format: a cusparseHybMat_t for ELL/HYB, a CsrMatrix* for CSR.
   Also releases the matrix descriptor, the compressed-row index container,
   and the three device-resident scalars (alpha, beta_zero, beta_one),
   then zeros the caller's pointer. A NULL *matstruct is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB both store a cusparse hybrid matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the PetscErrorCode return (was silently discarded) */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta scalars live in device memory; free with cudaFree, not delete */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1929 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - frees all four triangular-factor
   structs (lower/upper, plus their transposes), the permutation index
   containers, the work vector, and the cusparse handle of a
   Mat_SeqAIJCUSPARSETriFactors, then zeros the caller's pointer.
   A NULL *trifactors is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* these destroy routines return PetscErrorCode; check it (was silently discarded) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) { /* extra parens: assignment is intentional */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    delete *trifactors;
    *trifactors = 0; /* prevent a dangling pointer / double free in the caller */
  }
  PetscFunctionReturn(0);
}
1952 
1953