xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 63f3c55c12ae2f190c391f6df6c540efe018a44a)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 
37 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
38 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
39 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
40 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
41 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
42 
43 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
44 {
45   cusparseStatus_t   stat;
46   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
47 
48   PetscFunctionBegin;
49   cusparsestruct->stream = stream;
50   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
51   PetscFunctionReturn(0);
52 }
53 
54 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
55 {
56   cusparseStatus_t   stat;
57   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
58 
59   PetscFunctionBegin;
60   if (cusparsestruct->handle != handle) {
61     if (cusparsestruct->handle) {
62       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
63     }
64     cusparsestruct->handle = handle;
65   }
66   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
67   PetscFunctionReturn(0);
68 }
69 
70 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
71 {
72   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
73   PetscFunctionBegin;
74   if (cusparsestruct->handle)
75     cusparsestruct->handle = 0;
76   PetscFunctionReturn(0);
77 }
78 
79 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
80 {
81   PetscFunctionBegin;
82   *type = MATSOLVERCUSPARSE;
83   PetscFunctionReturn(0);
84 }
85 
86 /*MC
87   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
88   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
89   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
90   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
91   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
92   algorithms are not recommended. This class does NOT support direct solver operations.
93 
94   Level: beginner
95 
96 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
97 M*/
98 
99 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
100 {
101   PetscErrorCode ierr;
102   PetscInt       n = A->rmap->n;
103 
104   PetscFunctionBegin;
105   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
106   (*B)->factortype = ftype;
107   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
108   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
109 
110   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
111     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
112     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
113     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
114   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
115     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
116     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
117   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
118 
119   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
120   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
121   PetscFunctionReturn(0);
122 }
123 
124 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
125 {
126   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
127 
128   PetscFunctionBegin;
129   switch (op) {
130   case MAT_CUSPARSE_MULT:
131     cusparsestruct->format = format;
132     break;
133   case MAT_CUSPARSE_ALL:
134     cusparsestruct->format = format;
135     break;
136   default:
137     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
138   }
139   PetscFunctionReturn(0);
140 }
141 
142 /*@
143    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144    operation. Only the MatMult operation can use different GPU storage formats
145    for MPIAIJCUSPARSE matrices.
146    Not Collective
147 
148    Input Parameters:
149 +  A - Matrix of type SEQAIJCUSPARSE
150 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
152 
153    Output Parameter:
154 
155    Level: intermediate
156 
157 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 @*/
159 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
160 {
161   PetscErrorCode ierr;
162 
163   PetscFunctionBegin;
164   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
165   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
166   PetscFunctionReturn(0);
167 }
168 
169 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
170 {
171   PetscErrorCode           ierr;
172   MatCUSPARSEStorageFormat format;
173   PetscBool                flg;
174   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
175 
176   PetscFunctionBegin;
177   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
178   if (A->factortype==MAT_FACTOR_NONE) {
179     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
180                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
181     if (flg) {
182       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
183     }
184   }
185   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
186                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
187   if (flg) {
188     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
189   }
190   ierr = PetscOptionsTail();CHKERRQ(ierr);
191   PetscFunctionReturn(0);
192 
193 }
194 
195 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
196 {
197   PetscErrorCode ierr;
198 
199   PetscFunctionBegin;
200   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
201   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
202   PetscFunctionReturn(0);
203 }
204 
205 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
206 {
207   PetscErrorCode ierr;
208 
209   PetscFunctionBegin;
210   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
211   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
212   PetscFunctionReturn(0);
213 }
214 
215 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
216 {
217   PetscErrorCode ierr;
218 
219   PetscFunctionBegin;
220   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
221   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
222   PetscFunctionReturn(0);
223 }
224 
225 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
226 {
227   PetscErrorCode ierr;
228 
229   PetscFunctionBegin;
230   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
231   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
232   PetscFunctionReturn(0);
233 }
234 
235 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
236 {
237   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
238   PetscInt                          n = A->rmap->n;
239   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
240   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
241   cusparseStatus_t                  stat;
242   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
243   const MatScalar                   *aa = a->a,*v;
244   PetscInt                          *AiLo, *AjLo;
245   PetscScalar                       *AALo;
246   PetscInt                          i,nz, nzLower, offset, rowOffset;
247   PetscErrorCode                    ierr;
248 
249   PetscFunctionBegin;
250   if (!n) PetscFunctionReturn(0);
251   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
252     try {
253       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
254       nzLower=n+ai[n]-ai[1];
255 
256       /* Allocate Space for the lower triangular matrix */
257       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
258       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
259       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
260 
261       /* Fill the lower triangular matrix */
262       AiLo[0]  = (PetscInt) 0;
263       AiLo[n]  = nzLower;
264       AjLo[0]  = (PetscInt) 0;
265       AALo[0]  = (MatScalar) 1.0;
266       v        = aa;
267       vi       = aj;
268       offset   = 1;
269       rowOffset= 1;
270       for (i=1; i<n; i++) {
271         nz = ai[i+1] - ai[i];
272         /* additional 1 for the term on the diagonal */
273         AiLo[i]    = rowOffset;
274         rowOffset += nz+1;
275 
276         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
277         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
278 
279         offset      += nz;
280         AjLo[offset] = (PetscInt) i;
281         AALo[offset] = (MatScalar) 1.0;
282         offset      += 1;
283 
284         v  += nz;
285         vi += nz;
286       }
287 
288       /* allocate space for the triangular factor information */
289       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
290 
291       /* Create the matrix description */
292       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
293       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
294       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
295       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
296       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
297 
298       /* Create the solve analysis information */
299       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
300 
301       /* set the operation */
302       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
303 
304       /* set the matrix */
305       loTriFactor->csrMat = new CsrMatrix;
306       loTriFactor->csrMat->num_rows = n;
307       loTriFactor->csrMat->num_cols = n;
308       loTriFactor->csrMat->num_entries = nzLower;
309 
310       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
311       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
312 
313       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
314       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
315 
316       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
317       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
318 
319       /* perform the solve analysis */
320       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
321                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
322                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
323                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
324 
325       /* assign the pointer. Is this really necessary? */
326       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
327 
328       ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
329       ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
330       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
331       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
332     } catch(char *ex) {
333       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
334     }
335   }
336   PetscFunctionReturn(0);
337 }
338 
339 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
340 {
341   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
342   PetscInt                          n = A->rmap->n;
343   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
344   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
345   cusparseStatus_t                  stat;
346   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
347   const MatScalar                   *aa = a->a,*v;
348   PetscInt                          *AiUp, *AjUp;
349   PetscScalar                       *AAUp;
350   PetscInt                          i,nz, nzUpper, offset;
351   PetscErrorCode                    ierr;
352 
353   PetscFunctionBegin;
354   if (!n) PetscFunctionReturn(0);
355   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
356     try {
357       /* next, figure out the number of nonzeros in the upper triangular matrix. */
358       nzUpper = adiag[0]-adiag[n];
359 
360       /* Allocate Space for the upper triangular matrix */
361       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
364 
365       /* Fill the upper triangular matrix */
366       AiUp[0]=(PetscInt) 0;
367       AiUp[n]=nzUpper;
368       offset = nzUpper;
369       for (i=n-1; i>=0; i--) {
370         v  = aa + adiag[i+1] + 1;
371         vi = aj + adiag[i+1] + 1;
372 
373         /* number of elements NOT on the diagonal */
374         nz = adiag[i] - adiag[i+1]-1;
375 
376         /* decrement the offset */
377         offset -= (nz+1);
378 
379         /* first, set the diagonal elements */
380         AjUp[offset] = (PetscInt) i;
381         AAUp[offset] = (MatScalar)1./v[nz];
382         AiUp[i]      = AiUp[i+1] - (nz+1);
383 
384         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
385         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
386       }
387 
388       /* allocate space for the triangular factor information */
389       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
390 
391       /* Create the matrix description */
392       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
397 
398       /* Create the solve analysis information */
399       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
400 
401       /* set the operation */
402       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
403 
404       /* set the matrix */
405       upTriFactor->csrMat = new CsrMatrix;
406       upTriFactor->csrMat->num_rows = n;
407       upTriFactor->csrMat->num_cols = n;
408       upTriFactor->csrMat->num_entries = nzUpper;
409 
410       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
412 
413       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
415 
416       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
418 
419       /* perform the solve analysis */
420       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
424 
425       /* assign the pointer. Is this really necessary? */
426       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
427 
428       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
432     } catch(char *ex) {
433       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
434     }
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
440 {
441   PetscErrorCode               ierr;
442   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
443   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
444   IS                           isrow = a->row,iscol = a->icol;
445   PetscBool                    row_identity,col_identity;
446   const PetscInt               *r,*c;
447   PetscInt                     n = A->rmap->n;
448 
449   PetscFunctionBegin;
450   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
451   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
452 
453   cusparseTriFactors->workVector = new THRUSTARRAY(n);
454   cusparseTriFactors->nnz=a->nz;
455 
456   A->offloadmask = PETSC_OFFLOAD_BOTH;
457   /* lower triangular indices */
458   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
459   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
460   if (!row_identity) {
461     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462     cusparseTriFactors->rpermIndices->assign(r, r+n);
463   }
464   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
465 
466   /* upper triangular indices */
467   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
468   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
469   if (!col_identity) {
470     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471     cusparseTriFactors->cpermIndices->assign(c, c+n);
472   }
473 
474   if (!row_identity && !col_identity) {
475     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
476   } else if(!row_identity) {
477     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
478   } else if(!col_identity) {
479     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
480   }
481 
482   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
487 {
488   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
489   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
491   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
492   cusparseStatus_t                  stat;
493   PetscErrorCode                    ierr;
494   PetscInt                          *AiUp, *AjUp;
495   PetscScalar                       *AAUp;
496   PetscScalar                       *AALo;
497   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
498   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
499   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
500   const MatScalar                   *aa = b->a,*v;
501 
502   PetscFunctionBegin;
503   if (!n) PetscFunctionReturn(0);
504   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
505     try {
506       /* Allocate Space for the upper triangular matrix */
507       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
508       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
509       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
510       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
511 
512       /* Fill the upper triangular matrix */
513       AiUp[0]=(PetscInt) 0;
514       AiUp[n]=nzUpper;
515       offset = 0;
516       for (i=0; i<n; i++) {
517         /* set the pointers */
518         v  = aa + ai[i];
519         vj = aj + ai[i];
520         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
521 
522         /* first, set the diagonal elements */
523         AjUp[offset] = (PetscInt) i;
524         AAUp[offset] = (MatScalar)1.0/v[nz];
525         AiUp[i]      = offset;
526         AALo[offset] = (MatScalar)1.0/v[nz];
527 
528         offset+=1;
529         if (nz>0) {
530           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
531           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
532           for (j=offset; j<offset+nz; j++) {
533             AAUp[j] = -AAUp[j];
534             AALo[j] = AAUp[j]/v[nz];
535           }
536           offset+=nz;
537         }
538       }
539 
540       /* allocate space for the triangular factor information */
541       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
542 
543       /* Create the matrix description */
544       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
545       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
546       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
547       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
548       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
549 
550       /* Create the solve analysis information */
551       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
552 
553       /* set the operation */
554       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
555 
556       /* set the matrix */
557       upTriFactor->csrMat = new CsrMatrix;
558       upTriFactor->csrMat->num_rows = A->rmap->n;
559       upTriFactor->csrMat->num_cols = A->cmap->n;
560       upTriFactor->csrMat->num_entries = a->nz;
561 
562       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
563       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
564 
565       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
566       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
567 
568       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
569       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
570 
571       /* perform the solve analysis */
572       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
573                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
574                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
575                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
576 
577       /* assign the pointer. Is this really necessary? */
578       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
579 
580       /* allocate space for the triangular factor information */
581       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
582 
583       /* Create the matrix description */
584       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
585       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
586       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
587       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
588       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
589 
590       /* Create the solve analysis information */
591       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
592 
593       /* set the operation */
594       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
595 
596       /* set the matrix */
597       loTriFactor->csrMat = new CsrMatrix;
598       loTriFactor->csrMat->num_rows = A->rmap->n;
599       loTriFactor->csrMat->num_cols = A->cmap->n;
600       loTriFactor->csrMat->num_entries = a->nz;
601 
602       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
603       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
604 
605       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
606       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
607 
608       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
609       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
610       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
611 
612       /* perform the solve analysis */
613       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
614                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
615                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
616                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
617 
618       /* assign the pointer. Is this really necessary? */
619       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
620 
621       A->offloadmask = PETSC_OFFLOAD_BOTH;
622       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
623       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
624       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
625       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
626     } catch(char *ex) {
627       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
628     }
629   }
630   PetscFunctionReturn(0);
631 }
632 
633 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
634 {
635   PetscErrorCode               ierr;
636   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
637   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
638   IS                           ip = a->row;
639   const PetscInt               *rip;
640   PetscBool                    perm_identity;
641   PetscInt                     n = A->rmap->n;
642 
643   PetscFunctionBegin;
644   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
645   cusparseTriFactors->workVector = new THRUSTARRAY(n);
646   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
647 
648   /* lower triangular indices */
649   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
650   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
651   if (!perm_identity) {
652     IS             iip;
653     const PetscInt *irip;
654 
655     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
656     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
657     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
658     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
659     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
660     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
661     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
662     ierr = ISDestroy(&iip);CHKERRQ(ierr);
663     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
664   }
665   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
666   PetscFunctionReturn(0);
667 }
668 
669 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
670 {
671   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
672   IS             isrow = b->row,iscol = b->col;
673   PetscBool      row_identity,col_identity;
674   PetscErrorCode ierr;
675 
676   PetscFunctionBegin;
677   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
678   /* determine which version of MatSolve needs to be used. */
679   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
680   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
681   if (row_identity && col_identity) {
682     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
683     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
684     B->ops->matsolve = NULL;
685     B->ops->matsolvetranspose = NULL;
686   } else {
687     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
688     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
689     B->ops->matsolve = NULL;
690     B->ops->matsolvetranspose = NULL;
691   }
692 
693   /* get the triangular factors */
694   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
695   PetscFunctionReturn(0);
696 }
697 
698 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
699 {
700   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
701   IS             ip = b->row;
702   PetscBool      perm_identity;
703   PetscErrorCode ierr;
704 
705   PetscFunctionBegin;
706   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
707 
708   /* determine which version of MatSolve needs to be used. */
709   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
710   if (perm_identity) {
711     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
712     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
713     B->ops->matsolve = NULL;
714     B->ops->matsolvetranspose = NULL;
715   } else {
716     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
717     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
718     B->ops->matsolve = NULL;
719     B->ops->matsolvetranspose = NULL;
720   }
721 
722   /* get the triangular factors */
723   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
724   PetscFunctionReturn(0);
725 }
726 
727 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
728 {
729   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
730   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
731   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
732   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
733   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
734   cusparseStatus_t                  stat;
735   cusparseIndexBase_t               indexBase;
736   cusparseMatrixType_t              matrixType;
737   cusparseFillMode_t                fillMode;
738   cusparseDiagType_t                diagType;
739 
740   PetscFunctionBegin;
741 
742   /*********************************************/
743   /* Now the Transpose of the Lower Tri Factor */
744   /*********************************************/
745 
746   /* allocate space for the transpose of the lower triangular factor */
747   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
748 
749   /* set the matrix descriptors of the lower triangular factor */
750   matrixType = cusparseGetMatType(loTriFactor->descr);
751   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
752   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
753     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
754   diagType = cusparseGetMatDiagType(loTriFactor->descr);
755 
756   /* Create the matrix description */
757   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
758   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
759   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
760   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
761   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
762 
763   /* Create the solve analysis information */
764   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
765 
766   /* set the operation */
767   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
768 
769   /* allocate GPU space for the CSC of the lower triangular factor*/
770   loTriFactorT->csrMat = new CsrMatrix;
771   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
772   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
773   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
774   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
775   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
776   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
777 
778   /* compute the transpose of the lower triangular factor, i.e. the CSC */
779   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
780                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
781                           loTriFactor->csrMat->values->data().get(),
782                           loTriFactor->csrMat->row_offsets->data().get(),
783                           loTriFactor->csrMat->column_indices->data().get(),
784                           loTriFactorT->csrMat->values->data().get(),
785                           loTriFactorT->csrMat->column_indices->data().get(),
786                           loTriFactorT->csrMat->row_offsets->data().get(),
787                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
788 
789   /* perform the solve analysis on the transposed matrix */
790   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
791                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
792                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
793                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
794                            loTriFactorT->solveInfo);CHKERRCUDA(stat);
795 
796   /* assign the pointer. Is this really necessary? */
797   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
798 
799   /*********************************************/
800   /* Now the Transpose of the Upper Tri Factor */
801   /*********************************************/
802 
803   /* allocate space for the transpose of the upper triangular factor */
804   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
805 
806   /* set the matrix descriptors of the upper triangular factor */
807   matrixType = cusparseGetMatType(upTriFactor->descr);
808   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
809   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
810     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
811   diagType = cusparseGetMatDiagType(upTriFactor->descr);
812 
813   /* Create the matrix description */
814   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
815   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
816   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
817   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
818   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
819 
820   /* Create the solve analysis information */
821   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
822 
823   /* set the operation */
824   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
825 
826   /* allocate GPU space for the CSC of the upper triangular factor*/
827   upTriFactorT->csrMat = new CsrMatrix;
828   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
829   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
830   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
831   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
832   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
833   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
834 
835   /* compute the transpose of the upper triangular factor, i.e. the CSC */
836   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
837                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
838                           upTriFactor->csrMat->values->data().get(),
839                           upTriFactor->csrMat->row_offsets->data().get(),
840                           upTriFactor->csrMat->column_indices->data().get(),
841                           upTriFactorT->csrMat->values->data().get(),
842                           upTriFactorT->csrMat->column_indices->data().get(),
843                           upTriFactorT->csrMat->row_offsets->data().get(),
844                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
845 
846   /* perform the solve analysis on the transposed matrix */
847   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
848                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
849                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
850                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
851                            upTriFactorT->solveInfo);CHKERRCUDA(stat);
852 
853   /* assign the pointer. Is this really necessary? */
854   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
855   PetscFunctionReturn(0);
856 }
857 
858 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
859 {
860   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
861   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
862   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
863   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
864   cusparseStatus_t             stat;
865   cusparseIndexBase_t          indexBase;
866   cudaError_t                  err;
867   PetscErrorCode               ierr;
868 
869   PetscFunctionBegin;
870 
871   /* allocate space for the triangular factor information */
872   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
873   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
874   indexBase = cusparseGetMatIndexBase(matstruct->descr);
875   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
876   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
877 
878   /* set alpha and beta */
879   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
880   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
881   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
882   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
883   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
884   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
885   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
886 
887   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
888     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
889     CsrMatrix *matrixT= new CsrMatrix;
890     thrust::device_vector<int> *rowoffsets_gpu;
891     const int ncomprow = matstruct->cprowIndices->size();
892     thrust::host_vector<int> cprow(ncomprow);
893     cprow = *matstruct->cprowIndices; // GPU --> CPU
894     matrixT->num_rows = A->cmap->n;
895     matrixT->num_cols = A->rmap->n;
896     matrixT->num_entries = a->nz;
897     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
898     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
899     matrixT->values = new THRUSTARRAY(a->nz);
900     PetscInt k,i;
901     thrust::host_vector<int> rowst_host(A->rmap->n+1);
902 
903     /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */
904     rowst_host[0] = 0;
905     for (k = 0, i = 0; i < A->rmap->n ; i++) {
906       if (k < ncomprow && i==cprow[k]) {
907 	rowst_host[i+1] = a->i[i+1];
908 	k++;
909       } else rowst_host[i+1] = rowst_host[i];
910     }
911     if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
912     rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
913     *rowoffsets_gpu = rowst_host; // CPU --> GPU
914 
915     /* compute the transpose of the upper triangular factor, i.e. the CSC */
916     indexBase = cusparseGetMatIndexBase(matstruct->descr);
917     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
918                             A->cmap->n, matrix->num_entries,
919                             matrix->values->data().get(),
920                             rowoffsets_gpu->data().get(),
921                             matrix->column_indices->data().get(),
922                             matrixT->values->data().get(),
923                             matrixT->column_indices->data().get(),
924                             matrixT->row_offsets->data().get(),
925                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
926 
927     /* assign the pointer */
928     matstructT->mat = matrixT;
929     ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
930   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
931     /* First convert HYB to CSR */
932     CsrMatrix *temp= new CsrMatrix;
933     temp->num_rows = A->rmap->n;
934     temp->num_cols = A->cmap->n;
935     temp->num_entries = a->nz;
936     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
937     temp->column_indices = new THRUSTINTARRAY32(a->nz);
938     temp->values = new THRUSTARRAY(a->nz);
939 
940 
941     stat = cusparse_hyb2csr(cusparsestruct->handle,
942                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
943                             temp->values->data().get(),
944                             temp->row_offsets->data().get(),
945                             temp->column_indices->data().get());CHKERRCUDA(stat);
946 
947     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
948     CsrMatrix *tempT= new CsrMatrix;
949     tempT->num_rows = A->rmap->n;
950     tempT->num_cols = A->cmap->n;
951     tempT->num_entries = a->nz;
952     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
953     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
954     tempT->values = new THRUSTARRAY(a->nz);
955 
956     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
957                             temp->num_cols, temp->num_entries,
958                             temp->values->data().get(),
959                             temp->row_offsets->data().get(),
960                             temp->column_indices->data().get(),
961                             tempT->values->data().get(),
962                             tempT->column_indices->data().get(),
963                             tempT->row_offsets->data().get(),
964                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
965 
966     /* Last, convert CSC to HYB */
967     cusparseHybMat_t hybMat;
968     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
969     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
970       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
971     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
972                             matstructT->descr, tempT->values->data().get(),
973                             tempT->row_offsets->data().get(),
974                             tempT->column_indices->data().get(),
975                             hybMat, 0, partition);CHKERRCUDA(stat);
976 
977     /* assign the pointer */
978     matstructT->mat = hybMat;
979     ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);
980 
981     /* delete temporaries */
982     if (tempT) {
983       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
984       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
985       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
986       delete (CsrMatrix*) tempT;
987     }
988     if (temp) {
989       if (temp->values) delete (THRUSTARRAY*) temp->values;
990       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
991       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
992       delete (CsrMatrix*) temp;
993     }
994   }
995   /* assign the compressed row indices */
996   matstructT->cprowIndices = new THRUSTINTARRAY;
997   matstructT->cprowIndices->resize(A->cmap->n);
998   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
999   /* assign the pointer */
1000   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1001   PetscFunctionReturn(0);
1002 }
1003 
1004 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1005 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1006 {
1007   PetscInt                              n = xx->map->n;
1008   const PetscScalar                     *barray;
1009   PetscScalar                           *xarray;
1010   thrust::device_ptr<const PetscScalar> bGPU;
1011   thrust::device_ptr<PetscScalar>       xGPU;
1012   cusparseStatus_t                      stat;
1013   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1014   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1015   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1016   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1017   PetscErrorCode                        ierr;
1018 
1019   PetscFunctionBegin;
1020   /* Analyze the matrix and create the transpose ... on the fly */
1021   if (!loTriFactorT && !upTriFactorT) {
1022     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1023     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1024     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1025   }
1026 
1027   /* Get the GPU pointers */
1028   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1029   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1030   xGPU = thrust::device_pointer_cast(xarray);
1031   bGPU = thrust::device_pointer_cast(barray);
1032 
1033   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1034   /* First, reorder with the row permutation */
1035   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1036                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1037                xGPU);
1038 
1039   /* First, solve U */
1040   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1041                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1042                         upTriFactorT->csrMat->values->data().get(),
1043                         upTriFactorT->csrMat->row_offsets->data().get(),
1044                         upTriFactorT->csrMat->column_indices->data().get(),
1045                         upTriFactorT->solveInfo,
1046                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1047 
1048   /* Then, solve L */
1049   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1050                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1051                         loTriFactorT->csrMat->values->data().get(),
1052                         loTriFactorT->csrMat->row_offsets->data().get(),
1053                         loTriFactorT->csrMat->column_indices->data().get(),
1054                         loTriFactorT->solveInfo,
1055                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1056 
1057   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1058   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1059                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1060                tempGPU->begin());
1061 
1062   /* Copy the temporary to the full solution. */
1063   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1064 
1065   /* restore */
1066   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1067   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1068   ierr = WaitForGPU();CHKERRCUDA(ierr);
1069   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1070   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1071   PetscFunctionReturn(0);
1072 }
1073 
1074 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1075 {
1076   const PetscScalar                 *barray;
1077   PetscScalar                       *xarray;
1078   cusparseStatus_t                  stat;
1079   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1080   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1081   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1082   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1083   PetscErrorCode                    ierr;
1084 
1085   PetscFunctionBegin;
1086   /* Analyze the matrix and create the transpose ... on the fly */
1087   if (!loTriFactorT && !upTriFactorT) {
1088     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1089     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1090     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1091   }
1092 
1093   /* Get the GPU pointers */
1094   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1095   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1096 
1097   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1098   /* First, solve U */
1099   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1100                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1101                         upTriFactorT->csrMat->values->data().get(),
1102                         upTriFactorT->csrMat->row_offsets->data().get(),
1103                         upTriFactorT->csrMat->column_indices->data().get(),
1104                         upTriFactorT->solveInfo,
1105                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1106 
1107   /* Then, solve L */
1108   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1109                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1110                         loTriFactorT->csrMat->values->data().get(),
1111                         loTriFactorT->csrMat->row_offsets->data().get(),
1112                         loTriFactorT->csrMat->column_indices->data().get(),
1113                         loTriFactorT->solveInfo,
1114                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1115 
1116   /* restore */
1117   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1118   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1119   ierr = WaitForGPU();CHKERRCUDA(ierr);
1120   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1121   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1122   PetscFunctionReturn(0);
1123 }
1124 
1125 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1126 {
1127   const PetscScalar                     *barray;
1128   PetscScalar                           *xarray;
1129   thrust::device_ptr<const PetscScalar> bGPU;
1130   thrust::device_ptr<PetscScalar>       xGPU;
1131   cusparseStatus_t                      stat;
1132   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1133   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1134   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1135   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1136   PetscErrorCode                        ierr;
1137 
1138   PetscFunctionBegin;
1139 
1140   /* Get the GPU pointers */
1141   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1142   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1143   xGPU = thrust::device_pointer_cast(xarray);
1144   bGPU = thrust::device_pointer_cast(barray);
1145 
1146   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1147   /* First, reorder with the row permutation */
1148   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1149                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1150                tempGPU->begin());
1151 
1152   /* Next, solve L */
1153   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1154                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1155                         loTriFactor->csrMat->values->data().get(),
1156                         loTriFactor->csrMat->row_offsets->data().get(),
1157                         loTriFactor->csrMat->column_indices->data().get(),
1158                         loTriFactor->solveInfo,
1159                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1160 
1161   /* Then, solve U */
1162   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1163                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1164                         upTriFactor->csrMat->values->data().get(),
1165                         upTriFactor->csrMat->row_offsets->data().get(),
1166                         upTriFactor->csrMat->column_indices->data().get(),
1167                         upTriFactor->solveInfo,
1168                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1169 
1170   /* Last, reorder with the column permutation */
1171   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1172                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1173                xGPU);
1174 
1175   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1176   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1177   ierr = WaitForGPU();CHKERRCUDA(ierr);
1178   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1179   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1180   PetscFunctionReturn(0);
1181 }
1182 
1183 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1184 {
1185   const PetscScalar                 *barray;
1186   PetscScalar                       *xarray;
1187   cusparseStatus_t                  stat;
1188   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1189   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1190   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1191   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1192   PetscErrorCode                    ierr;
1193 
1194   PetscFunctionBegin;
1195   /* Get the GPU pointers */
1196   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1197   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1198 
1199   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1200   /* First, solve L */
1201   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1202                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1203                         loTriFactor->csrMat->values->data().get(),
1204                         loTriFactor->csrMat->row_offsets->data().get(),
1205                         loTriFactor->csrMat->column_indices->data().get(),
1206                         loTriFactor->solveInfo,
1207                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1208 
1209   /* Next, solve U */
1210   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1211                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1212                         upTriFactor->csrMat->values->data().get(),
1213                         upTriFactor->csrMat->row_offsets->data().get(),
1214                         upTriFactor->csrMat->column_indices->data().get(),
1215                         upTriFactor->solveInfo,
1216                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1217 
1218   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1219   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1220   ierr = WaitForGPU();CHKERRCUDA(ierr);
1221   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1222   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1223   PetscFunctionReturn(0);
1224 }
1225 
1226 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1227 {
1228   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1229   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1230   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1231   PetscInt                     m = A->rmap->n,*ii,*ridx;
1232   PetscErrorCode               ierr;
1233   cusparseStatus_t             stat;
1234   cudaError_t                  err;
1235 
1236   PetscFunctionBegin;
1237   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1238     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1239     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1240       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1241       /* copy values only */
1242       matrix->values->assign(a->a, a->a+a->nz);
1243       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1244     } else {
1245       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1246       try {
1247         cusparsestruct->nonzerorow=0;
1248         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1249 
1250         if (a->compressedrow.use) {
1251           m    = a->compressedrow.nrows;
1252           ii   = a->compressedrow.i;
1253           ridx = a->compressedrow.rindex;
1254         } else {
1255           /* Forcing compressed row on the GPU */
1256           int k=0;
1257           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1258           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1259           ii[0]=0;
1260           for (int j = 0; j<m; j++) {
1261             if ((a->i[j+1]-a->i[j])>0) {
1262               ii[k]  = a->i[j];
1263               ridx[k]= j;
1264               k++;
1265             }
1266           }
1267           ii[cusparsestruct->nonzerorow] = a->nz;
1268           m = cusparsestruct->nonzerorow;
1269         }
1270 
1271         /* allocate space for the triangular factor information */
1272         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1273         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1274         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1275         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1276 
1277         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1278         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1279         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1280         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1281         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1282         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1283         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1284 
1285         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1286         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1287           /* set the matrix */
1288           CsrMatrix *matrix= new CsrMatrix;
1289           matrix->num_rows = m;
1290           matrix->num_cols = A->cmap->n;
1291           matrix->num_entries = a->nz;
1292           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1293           matrix->row_offsets->assign(ii, ii + m+1);
1294 
1295           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1296           matrix->column_indices->assign(a->j, a->j+a->nz);
1297 
1298           matrix->values = new THRUSTARRAY(a->nz);
1299           matrix->values->assign(a->a, a->a+a->nz);
1300 
1301           /* assign the pointer */
1302           matstruct->mat = matrix;
1303 
1304         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1305           CsrMatrix *matrix= new CsrMatrix;
1306           matrix->num_rows = m;
1307           matrix->num_cols = A->cmap->n;
1308           matrix->num_entries = a->nz;
1309           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1310           matrix->row_offsets->assign(ii, ii + m+1);
1311 
1312           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1313           matrix->column_indices->assign(a->j, a->j+a->nz);
1314 
1315           matrix->values = new THRUSTARRAY(a->nz);
1316           matrix->values->assign(a->a, a->a+a->nz);
1317 
1318           cusparseHybMat_t hybMat;
1319           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1320           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1321             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1322           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1323               matstruct->descr, matrix->values->data().get(),
1324               matrix->row_offsets->data().get(),
1325               matrix->column_indices->data().get(),
1326               hybMat, 0, partition);CHKERRCUDA(stat);
1327           /* assign the pointer */
1328           matstruct->mat = hybMat;
1329 
1330           if (matrix) {
1331             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1332             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1333             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1334             delete (CsrMatrix*)matrix;
1335           }
1336         }
1337 
1338         /* assign the compressed row indices */
1339         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1340         matstruct->cprowIndices->assign(ridx,ridx+m);
1341         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1342 
1343         /* assign the pointer */
1344         cusparsestruct->mat = matstruct;
1345 
1346         if (!a->compressedrow.use) {
1347           ierr = PetscFree(ii);CHKERRQ(ierr);
1348           ierr = PetscFree(ridx);CHKERRQ(ierr);
1349         }
1350         cusparsestruct->workVector = new THRUSTARRAY(m);
1351       } catch(char *ex) {
1352         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1353       }
1354       cusparsestruct->nonzerostate = A->nonzerostate;
1355     }
1356     ierr = WaitForGPU();CHKERRCUDA(ierr);
1357     A->offloadmask = PETSC_OFFLOAD_BOTH;
1358     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1359   }
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 struct VecCUDAPlusEquals
1364 {
1365   template <typename Tuple>
1366   __host__ __device__
1367   void operator()(Tuple t)
1368   {
1369     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1370   }
1371 };
1372 
1373 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1374 {
1375   PetscErrorCode ierr;
1376 
1377   PetscFunctionBegin;
1378   ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
1379   PetscFunctionReturn(0);
1380 }
1381 
1382 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1383 {
1384   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1385   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1386   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1387   const PetscScalar            *xarray;
1388   PetscScalar                  *yarray;
1389   PetscErrorCode               ierr;
1390   cusparseStatus_t             stat;
1391 
1392   PetscFunctionBegin;
1393   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1394   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1395   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1396   if (!matstructT) {
1397     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1398     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1399   }
1400   ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1401   ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1402   if (yy->map->n) {
1403     PetscInt                     n = yy->map->n;
1404     cudaError_t                  err;
1405     err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
1406   }
1407 
1408   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1409   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1410     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1411     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1412                              mat->num_rows, mat->num_cols,
1413                              mat->num_entries, matstructT->alpha, matstructT->descr,
1414                              mat->values->data().get(), mat->row_offsets->data().get(),
1415                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1416                              yarray);CHKERRCUDA(stat);
1417   } else {
1418     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1419     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1420                              matstructT->alpha, matstructT->descr, hybMat,
1421                              xarray, matstructT->beta_zero,
1422                              yarray);CHKERRCUDA(stat);
1423   }
1424   ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1425   ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1426   ierr = WaitForGPU();CHKERRCUDA(ierr);
1427   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1428   ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1429   PetscFunctionReturn(0);
1430 }
1431 
1432 
1433 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1434 {
1435   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1436   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1437   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1438   const PetscScalar            *xarray;
1439   PetscScalar                  *zarray,*dptr,*beta;
1440   PetscErrorCode               ierr;
1441   cusparseStatus_t             stat;
1442   PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */
1443 
1444   PetscFunctionBegin;
1445   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1446   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1447   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1448   try {
1449     cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
1450     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1451     if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
1452       ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1453     } else {
1454       ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1455     }
1456     dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
1457     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
1458 
1459     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1460     /* csr_spmv is multiply add */
1461     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1462       /* here we need to be careful to set the number of rows in the multiply to the
1463          number of compressed rows in the matrix ... which is equivalent to the
1464          size of the workVector */
1465       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1466       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1467                                mat->num_rows, mat->num_cols,
1468                                mat->num_entries, matstruct->alpha, matstruct->descr,
1469                                mat->values->data().get(), mat->row_offsets->data().get(),
1470                                mat->column_indices->data().get(), xarray, beta,
1471                                dptr);CHKERRCUDA(stat);
1472     } else {
1473       if (cusparsestruct->workVector->size()) {
1474         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1475         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1476                                  matstruct->alpha, matstruct->descr, hybMat,
1477                                  xarray, beta,
1478                                  dptr);CHKERRCUDA(stat);
1479       }
1480     }
1481     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1482 
1483     if (yy) {
1484       if (dptr != zarray) {
1485         ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1486       } else if (zz != yy) {
1487         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
1488       }
1489     } else if (dptr != zarray) {
1490       ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
1491     }
1492     /* scatter the data from the temporary into the full vector with a += operation */
1493     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1494     if (dptr != zarray) {
1495       thrust::device_ptr<PetscScalar> zptr;
1496 
1497       zptr = thrust::device_pointer_cast(zarray);
1498       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1499                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1500                        VecCUDAPlusEquals());
1501     }
1502     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1503     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1504     if (yy && !cmpr) {
1505       ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1506     } else {
1507       ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1508     }
1509   } catch(char *ex) {
1510     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1511   }
1512   ierr = WaitForGPU();CHKERRCUDA(ierr);
1513   ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1514   PetscFunctionReturn(0);
1515 }
1516 
1517 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1518 {
1519   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1520   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1521   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1522   const PetscScalar               *xarray;
1523   PetscScalar                     *zarray,*dptr,*beta;
1524   PetscErrorCode                  ierr;
1525   cusparseStatus_t                stat;
1526 
1527   PetscFunctionBegin;
1528   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1529   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1530   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1531   if (!matstructT) {
1532     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1533     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1534   }
1535 
1536   try {
1537     ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1538     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1539     ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1540     dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
1541     beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;
1542 
1543     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1544     /* multiply add with matrix transpose */
1545     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1546       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1547       /* here we need to be careful to set the number of rows in the multiply to the
1548          number of compressed rows in the matrix ... which is equivalent to the
1549          size of the workVector */
1550       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1551                                mat->num_rows, mat->num_cols,
1552                                mat->num_entries, matstructT->alpha, matstructT->descr,
1553                                mat->values->data().get(), mat->row_offsets->data().get(),
1554                                mat->column_indices->data().get(), xarray, beta,
1555                                dptr);CHKERRCUDA(stat);
1556     } else {
1557       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1558       if (cusparsestruct->workVector->size()) {
1559         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1560                                  matstructT->alpha, matstructT->descr, hybMat,
1561                                  xarray, beta,
1562                                  dptr);CHKERRCUDA(stat);
1563       }
1564     }
1565     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1566 
1567     if (dptr != zarray) {
1568       ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1569     } else if (zz != yy) {
1570       ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
1571     }
1572     /* scatter the data from the temporary into the full vector with a += operation */
1573     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1574     if (dptr != zarray) {
1575       thrust::device_ptr<PetscScalar> zptr;
1576 
1577       zptr = thrust::device_pointer_cast(zarray);
1578 
1579       /* scatter the data from the temporary into the full vector with a += operation */
1580       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1581                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1582                        VecCUDAPlusEquals());
1583     }
1584     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1585     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1586     ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1587   } catch(char *ex) {
1588     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1589   }
1590   ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
1591   ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1592   PetscFunctionReturn(0);
1593 }
1594 
1595 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1596 {
1597   PetscErrorCode ierr;
1598 
1599   PetscFunctionBegin;
1600   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1601   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1602   if (A->factortype == MAT_FACTOR_NONE) {
1603     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1604   }
1605   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1606   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1607   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1608   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1609   PetscFunctionReturn(0);
1610 }
1611 
1612 /* --------------------------------------------------------------------------------*/
1613 /*@
1614    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1615    (the default parallel PETSc format). This matrix will ultimately pushed down
1616    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1617    assembly performance the user should preallocate the matrix storage by setting
1618    the parameter nz (or the array nnz).  By setting these parameters accurately,
1619    performance during matrix assembly can be increased by more than a factor of 50.
1620 
1621    Collective
1622 
1623    Input Parameters:
1624 +  comm - MPI communicator, set to PETSC_COMM_SELF
1625 .  m - number of rows
1626 .  n - number of columns
1627 .  nz - number of nonzeros per row (same for all rows)
1628 -  nnz - array containing the number of nonzeros in the various rows
1629          (possibly different for each row) or NULL
1630 
1631    Output Parameter:
1632 .  A - the matrix
1633 
1634    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1635    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1636    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1637 
1638    Notes:
1639    If nnz is given then nz is ignored
1640 
1641    The AIJ format (also called the Yale sparse matrix format or
1642    compressed row storage), is fully compatible with standard Fortran 77
1643    storage.  That is, the stored row and column indices can begin at
1644    either one (as in Fortran) or zero.  See the users' manual for details.
1645 
1646    Specify the preallocated storage with either nz or nnz (not both).
1647    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1648    allocation.  For large problems you MUST preallocate memory or you
1649    will get TERRIBLE performance, see the users' manual chapter on matrices.
1650 
1651    By default, this format uses inodes (identical nodes) when possible, to
1652    improve numerical efficiency of matrix-vector products and solves. We
1653    search for consecutive rows with the same nonzero structure, thereby
1654    reusing matrix information to achieve increased efficiency.
1655 
1656    Level: intermediate
1657 
1658 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1659 @*/
1660 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1661 {
1662   PetscErrorCode ierr;
1663 
1664   PetscFunctionBegin;
1665   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1666   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1667   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1668   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1669   PetscFunctionReturn(0);
1670 }
1671 
1672 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1673 {
1674   PetscErrorCode   ierr;
1675 
1676   PetscFunctionBegin;
1677   if (A->factortype==MAT_FACTOR_NONE) {
1678     if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
1679       ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
1680     }
1681   } else {
1682     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1683   }
1684   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1685   PetscFunctionReturn(0);
1686 }
1687 
1688 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1689 {
1690   PetscErrorCode ierr;
1691   Mat C;
1692   cusparseStatus_t stat;
1693   cusparseHandle_t handle=0;
1694 
1695   PetscFunctionBegin;
1696   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
1697   C    = *B;
1698   ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
1699   ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);
1700 
1701   /* inject CUSPARSE-specific stuff */
1702   if (C->factortype==MAT_FACTOR_NONE) {
1703     /* you cannot check the inode.use flag here since the matrix was just created.
1704        now build a GPU matrix data structure */
1705     C->spptr = new Mat_SeqAIJCUSPARSE;
1706     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
1707     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1708     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
1709     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
1710     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1711     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1712     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
1713     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1714   } else {
1715     /* NEXT, set the pointers to the triangular factors */
1716     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1717     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1718     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1719     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1720     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1721     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1722     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1723     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1724     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1725     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1726     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1727     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1728   }
1729 
1730   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1731   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1732   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1733   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1734   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1735   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1736   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1737   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1738 
1739   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1740 
1741   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1742 
1743   ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1744   PetscFunctionReturn(0);
1745 }
1746 
1747 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1748 {
1749   PetscErrorCode ierr;
1750   cusparseStatus_t stat;
1751   cusparseHandle_t handle=0;
1752 
1753   PetscFunctionBegin;
1754   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1755   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
1756 
1757   if (B->factortype==MAT_FACTOR_NONE) {
1758     /* you cannot check the inode.use flag here since the matrix was just created.
1759        now build a GPU matrix data structure */
1760     B->spptr = new Mat_SeqAIJCUSPARSE;
1761     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1762     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1763     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1764     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1765     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1766     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1767     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1768     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1769   } else {
1770     /* NEXT, set the pointers to the triangular factors */
1771     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1772     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1773     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1774     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1775     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1776     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1777     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1778     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1779     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1780     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1781     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1782   }
1783 
1784   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1785   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1786   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1787   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1788   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1789   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1790   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1791   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1792 
1793   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1794 
1795   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1796 
1797   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1798   PetscFunctionReturn(0);
1799 }
1800 
1801 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1802 {
1803   PetscErrorCode ierr;
1804 
1805   PetscFunctionBegin;
1806   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
1807   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
1808   PetscFunctionReturn(0);
1809 }
1810 
1811 /*MC
1812    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1813 
1814    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1815    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1816    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1817 
1818    Options Database Keys:
1819 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1820 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1821 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1822 
1823   Level: beginner
1824 
1825 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1826 M*/
1827 
1828 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1829 
1830 
1831 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1832 {
1833   PetscErrorCode ierr;
1834 
1835   PetscFunctionBegin;
1836   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1837   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1838   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1839   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1840   PetscFunctionReturn(0);
1841 }
1842 
1843 
1844 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1845 {
1846   cusparseStatus_t stat;
1847   cusparseHandle_t handle;
1848 
1849   PetscFunctionBegin;
1850   if (*cusparsestruct) {
1851     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1852     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1853     delete (*cusparsestruct)->workVector;
1854     if (handle = (*cusparsestruct)->handle) {
1855       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1856     }
1857     delete *cusparsestruct;
1858     *cusparsestruct = 0;
1859   }
1860   PetscFunctionReturn(0);
1861 }
1862 
1863 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1864 {
1865   PetscFunctionBegin;
1866   if (*mat) {
1867     delete (*mat)->values;
1868     delete (*mat)->column_indices;
1869     delete (*mat)->row_offsets;
1870     delete *mat;
1871     *mat = 0;
1872   }
1873   PetscFunctionReturn(0);
1874 }
1875 
1876 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1877 {
1878   cusparseStatus_t stat;
1879   PetscErrorCode   ierr;
1880 
1881   PetscFunctionBegin;
1882   if (*trifactor) {
1883     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1884     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1885     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
1886     delete *trifactor;
1887     *trifactor = 0;
1888   }
1889   PetscFunctionReturn(0);
1890 }
1891 
1892 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1893 {
1894   CsrMatrix        *mat;
1895   cusparseStatus_t stat;
1896   cudaError_t      err;
1897 
1898   PetscFunctionBegin;
1899   if (*matstruct) {
1900     if ((*matstruct)->mat) {
1901       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1902         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1903         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1904       } else {
1905         mat = (CsrMatrix*)(*matstruct)->mat;
1906         CsrMatrix_Destroy(&mat);
1907       }
1908     }
1909     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1910     delete (*matstruct)->cprowIndices;
1911     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1912     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1913     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1914     delete *matstruct;
1915     *matstruct = 0;
1916   }
1917   PetscFunctionReturn(0);
1918 }
1919 
1920 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1921 {
1922   cusparseHandle_t handle;
1923   cusparseStatus_t stat;
1924 
1925   PetscFunctionBegin;
1926   if (*trifactors) {
1927     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1928     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1929     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1930     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1931     delete (*trifactors)->rpermIndices;
1932     delete (*trifactors)->cpermIndices;
1933     delete (*trifactors)->workVector;
1934     if (handle = (*trifactors)->handle) {
1935       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1936     }
1937     delete *trifactors;
1938     *trifactors = 0;
1939   }
1940   PetscFunctionReturn(0);
1941 }
1942 
1943