xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 7c700b8da5d6204518062b989b69a232d6873208)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* String names for MatCUSPARSEStorageFormat used by PetscOptionsEnum(); the trailing
   entries ("MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0) follow the PETSc
   enum-string-table convention: enum type name, option prefix, NULL terminator. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 
37 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
38 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
39 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
40 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
41 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
42 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU data
   structure and attaches it to the matrix's CUSPARSE handle, so subsequent CUSPARSE
   calls made through that handle are queued on this stream.

   Assumes A is a SEQAIJCUSPARSE matrix whose GPU structure (A->spptr) has been created.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(cstat);
  PetscFunctionReturn(0);
}
53 
/*
   MatCUSPARSESetHandle - installs an externally created CUSPARSE handle on the matrix.
   If the matrix already owns a different handle, that handle is destroyed first.
   The pointer mode of the (possibly new) handle is always set to device pointers,
   which is what the mult/solve routines in this file expect.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  /* replace the stored handle only when it actually differs */
  if (cusp->handle != handle) {
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUDA(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(cstat);
  PetscFunctionReturn(0);
}
69 
/*
   MatCUSPARSEClearHandle - forgets (but does not destroy) the CUSPARSE handle stored
   on the matrix; used when the handle is owned by someone else (see
   MatCUSPARSESetHandle()) and must not be freed with the matrix.

   Fix: guard against a NULL A->spptr — this routine can be invoked on a matrix whose
   GPU data structure was never created (or was already destroyed), in which case the
   original code dereferenced a NULL pointer.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* no GPU structure: nothing to clear */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
78 
/* Query function composed on factor matrices created by MatGetFactor_seqaijcusparse_cusparse():
   reports MATSOLVERCUSPARSE as the solver package that produced the factorization. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
85 
86 /*MC
87   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
88   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
90   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
91   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
92   algorithms are not recommended. This class does NOT support direct solver operations.
93 
94   Level: beginner
95 
96 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
97 M*/
98 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates the (empty) factor matrix B for an
   LU/ILU/ILUDT or Cholesky/ICC factorization of A performed through the CUSPARSE
   solver package. Only the symbolic-factorization entry points are installed here;
   the symbolic routines in turn install the numeric routines.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;   /* factors are square, n x n */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: storage for the factor is allocated later by the symbolic stage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
123 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(). A sequential matrix carries a single storage format,
   so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL select the same field.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations map to the one format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
141 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the implementation composed on A (silently a no-op for other types) */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
/*
   Processes the runtime options controlling the GPU storage format:
     -mat_cusparse_mult_storage_format  (SpMV only; non-factored matrices)
     -mat_cusparse_storage_format       (SpMV and triangular solves)
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  /* NOTE(review): for factored matrices (A->factortype != MAT_FACTOR_NONE) A->spptr holds a
     Mat_SeqAIJCUSPARSETriFactors, not a Mat_SeqAIJCUSPARSE (see the factor routines below),
     so this cast and the cusparsestruct->format default used in the second option look
     questionable in that case — confirm */
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
194 
/* Symbolic ILU: delegates to the CPU SeqAIJ implementation, then installs the
   CUSPARSE-aware numeric routine (which factors on the CPU and copies the triangular
   factors to the GPU — see MatLUFactorNumeric_SeqAIJCUSPARSE). */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
/* Symbolic LU: delegates to the CPU SeqAIJ implementation, then installs the
   CUSPARSE-aware numeric routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
/* Symbolic ICC: delegates to the CPU SeqAIJ implementation, then installs the
   CUSPARSE-aware Cholesky numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
/* Symbolic Cholesky: delegates to the CPU SeqAIJ implementation, then installs the
   CUSPARSE-aware Cholesky numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/*
   Builds the lower-triangular factor L of an ILU factorization on the GPU.

   The SeqAIJ factor (A->data) stores L and U together; this routine extracts the
   strictly-lower part of each row into pinned host buffers, appends an explicit unit
   diagonal entry per row, uploads the CSR arrays to the device, and runs the CUSPARSE
   triangular-solve analysis. Runs only when the host copy is newer than the device
   copy (offloadmask UNALLOCATED or CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;    /* host CSR row offsets / column indices for L */
  PetscScalar                       *AALo;           /* host CSR values for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 has none) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the thrust assign()s below copy at full bandwidth */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;   /* row 0 holds only its unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): any previously built loTriFactorPtr is overwritten below without being
         destroyed — presumably the offloadmask guard makes rebuilds impossible here; confirm */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* device-side CSR arrays: assign() copies host -> device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers now that the device copies exist */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
338 
/*
   Builds the upper-triangular factor U of an ILU factorization on the GPU.

   Mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix(): rows of U are recovered from the
   SeqAIJ factor via a->diag (the factored format addresses U rows through adiag, walked
   here from the last row backwards), the stored reciprocal diagonal is inverted back
   into an explicit diagonal value (descriptor uses CUSPARSE_DIAG_TYPE_NON_UNIT), the
   CSR arrays are uploaded, and the CUSPARSE solve analysis is run.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;    /* host CSR row offsets / column indices for U */
  PetscScalar                       *AAUp;           /* host CSR values for U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with the row index in the factored layout, hence this difference */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;       /* fill from the end since rows are visited last-to-first */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];   /* factor stores 1/d; invert back to d */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* device-side CSR arrays: assign() copies host -> device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers now that the device copies exist */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
438 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both ILU triangular factors on
   the GPU (including the CUSPARSE solve analyses), allocates the work vector, and
   uploads the row/column permutation indices unless the orderings are identities.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  factors->workVector = new THRUSTARRAY(n);
  factors->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: uploaded only when the ordering is not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r, r+n);
    nperm += n;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation: uploaded only when the ordering is not the identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c, c+n);
    nperm += n;
  }

  /* log exactly the number of permutation entries that were copied to the device */
  if (nperm) {
    ierr = PetscLogCpuToGpu(nperm*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
485 
/*
   Builds both triangular factors of an ICC factorization on the GPU.

   Only the upper-triangular structure is materialized on the host; the "lower" factor
   reuses the identical CSR sparsity (row offsets and column indices of the upper
   storage) with its own values and is solved as a TRANSPOSE operation on an
   upper-stored matrix, avoiding an explicit transpose. Runs only when the host copy
   is newer than the device copy.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;   /* host CSR structure shared by both factors */
  PetscScalar                       *AAUp;          /* host values for the upper factor */
  PetscScalar                       *AALo;          /* host values for the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (above) and Mat_SeqSBAIJ here; this
     relies on the i/j/a fields lining up between the two structs — confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored diagonal of row i; both factors record its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries: negated for the upper factor, additionally scaled
             by the reciprocal diagonal for the lower factor */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* note: the "lower" factor keeps upper fill mode and uses a TRANSPOSE solve below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity pattern as the upper factor, different values (AALo) */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers now that the device copies exist */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
632 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC triangular factors on the
   GPU, allocates the work vector, and, when the ordering is not the identity, uploads
   both the permutation and its inverse (used as row and column permutations).
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch device vector of length n — presumably consumed by the MatSolve routines; confirm */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* total nnz of both factors: the off-diagonal part (a->nz - n) appears in each factor,
     the n diagonal entries are counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* the column permutation for the symmetric factorization is the inverse of the row one */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
668 
/*
   Numeric LU factorization: the factorization itself runs on the CPU via the
   SeqAIJ kernel; afterwards the appropriate GPU solve variants are selected
   and the triangular factors are copied to the device.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering on both rows and columns permits the permutation-free
     solve kernels; otherwise use the permuting variants */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve support */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
697 
/*
   Numeric Cholesky factorization: factorize on the CPU with the SeqAIJ kernel,
   pick the GPU solve variants based on the ordering, then copy the factors to
   the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* identity permutation permits the permutation-free solve kernels */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve support */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
726 
/*
   Builds explicit transposes (CSC, stored as CSR of the transpose) of the lower and
   upper triangular factors on the GPU and runs the cuSPARSE solve analysis on them.
   This lets MatSolveTranspose_* use CUSPARSE_OPERATION_NON_TRANSPOSE solves on the
   transposed data.  Called lazily from the transpose-solve routines on first use.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  /* NOTE(review): the next two initializations are dead stores -- both locals are
     unconditionally overwritten with `new` below */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips because transposing swaps lower <-> upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: non-transpose, since the data itself is transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     the fill mode flips because transposing swaps lower <-> upper */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: non-transpose, since the data itself is transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
857 
/*
   Builds the device-side transpose of the matrix (cusparsestruct->matTranspose),
   used by MatMultTranspose.  For CSR storage the transpose is computed with
   cusparse_csr2csc; for ELL/HYB storage the matrix is first converted to CSR,
   transposed, and converted back.

   Fix: the temporary device vector rowoffsets_gpu used to expand the compressed
   row offsets was previously never freed, leaking GPU memory each time the
   transpose was (re)generated.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose mult structure and its descriptor */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars used with
     CUSPARSE_POINTER_MODE_DEVICE in the SpMV calls */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compressed rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU):
       rows not present in cprow are empty, so they repeat the previous offset */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* the expanded row offsets were only needed for the csr2csc call above;
       free them to avoid leaking GPU memory on each (re)generation */
    delete rowoffsets_gpu;

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so its "compressed" row list is simply 0..cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1003 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Transpose triangular solve with permutations: x = A^{-T} b where A = L*U was
   factored with a non-identity ordering.  Solves with explicit device-side
   transposes of the factors (built lazily below), so the cuSPARSE calls use
   NON_TRANSPOSE on the transposed data.  Since A^T = U^T L^T, U^T is solved first.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into x.
     NOTE(review): the end iterator uses bGPU+n as its base while the begin uses bGPU;
     thrust's permutation_iterator comparison only looks at the index iterator, so this
     appears harmless, but MatSolve_SeqAIJCUSPARSE uses plain bGPU -- confirm intent. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U^T (the transposed upper factor): x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L^T (the transposed lower factor): temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1073 
/*
   Transpose triangular solve, natural ordering (no permutations needed):
   x = A^{-T} b with A = L*U.  Uses explicit device-side transposes of the
   factors (built lazily on the first call); since A^T = U^T L^T, the
   transposed upper factor is solved first.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T (the transposed upper factor): b -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L^T (the transposed lower factor): temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1124 
/*
   Triangular solve with permutations: x = A^{-1} b where A = L*U was factored
   with a non-identity ordering.  Pipeline: temp = rperm(b); solve L (temp -> x);
   solve U (x -> temp); x = cperm(temp).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Then, solve U: x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Last, reorder the work vector with the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1182 
/*
   Triangular solve, natural ordering (no permutations needed): x = A^{-1} b
   with A = L*U.  Solves L (b -> temp) then U (temp -> x) on the GPU.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1225 
/* Copies the host (CPU) AIJ matrix data of A to the GPU, (re)building the CUSPARSE
   data structures held in A->spptr as needed.  No-op when the device copy is already
   up to date (offloadmask is neither UNALLOCATED nor CPU).  On the GPU the matrix is
   always stored in compressed-row form, i.e. rows with no nonzeros are removed and
   cprowIndices maps compressed row numbers back to original row numbers. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* fast path: same nonzero pattern as the existing device copy and CSR storage,
         so only the numerical values need to be refreshed */
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* first assembly, changed nonzero pattern, or non-CSR format: rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      try {
        /* count the rows that contain at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* the host already stores the matrix in compressed-row form; reuse its arrays */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build temporary row offsets (ii) and a
             compressed-row -> original-row map (ridx) covering only nonempty rows */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;  /* from here on, m is the compressed row count */
        }

        /* allocate the SpMV struct: matrix descriptor plus device-resident scalar constants */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* alpha/beta constants live on the device so spmv calls can use
           CUSPARSE_POINTER_MODE_DEVICE (set just below) */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: build a temporary device CSR matrix, convert it with csr2hyb,
             then free the temporary */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL maps to the MAX partition (pure ellpack), HYB lets cusparse choose */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR arrays were only needed as csr2hyb input; release them now */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were temporaries only when we built the compression ourselves */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector of compressed length, used by MatMultAdd for the scatter-add */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1364 
/* Thrust functor over a (source, destination) tuple: accumulates the first
   element into the second, i.e. dst += src.  Used below with a permutation
   iterator to scatter-add a compressed work vector into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1374 
/* Computes yy = A*xx on the GPU; implemented as MultAdd with a NULL yy argument
   so that no vector addition is performed. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1383 
/* Computes yy = A^T*xx on the GPU.  Rather than asking CUSPARSE for a transposed
   spmv, an explicitly transposed copy of the matrix (matTranspose) is built
   lazily on first use and multiplied non-transposed. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the transposed copy on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR storage: multiply the transposed copy without the transpose flag */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB storage */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* one multiply and one add per nonzero, minus one add per nonempty row */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1433 
1434 
/* Computes zz = A*xx + yy (or zz = A*xx when yy is NULL) on the GPU.
   When the matrix is stored in compressed-row form (some rows empty) the spmv
   result has fewer entries than zz; it is computed into a scratch workVector
   and then scattered-added into zz through cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when workVector has full length, i.e. no rows were removed by
       compression, so the spmv result can be written directly into zz */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* dptr is where the spmv writes: zz itself when uncompressed, else the scratch vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate (beta=1) only when the spmv output aliases zz and zz already holds yy */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result is still in the scratch vector: seed zz with yy before the scatter-add */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* spmv overwrote zz (beta was zero); add yy now */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* no yy: the rows not represented in the compressed result must be zero */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* zz[cprowIndices[k]] += workVector[k] for every compressed row k */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1518 
/* Computes zz = A^T*xx + yy on the GPU, using the lazily-built explicit
   transpose (matTranspose) so the spmv itself runs non-transposed.  When the
   transpose is stored compressed, the spmv result lands in the scratch
   workVector and is scattered-added into zz through cprowIndices. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*dptr,*beta;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the transposed copy on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* NOTE(review): this up-front copy of yy into zz is overwritten whenever beta is
       zero and copied again in the dptr != zarray branch below — looks redundant; confirm */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* write directly into zz when the transpose is uncompressed (workVector has
       full length A->cmap->n), otherwise into the scratch vector */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate (beta=1) only when writing in place into zz and zz aliases yy */
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      /* result is in the scratch vector: seed zz with yy before the scatter-add */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      /* spmv overwrote zz (beta was zero); add yy now */
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation
         NOTE(review): the iteration count here is A->cmap->n, while the analogous
         scatter in MatMultAdd uses workVector->size(); in this (dptr != zarray)
         branch the two differ — verify which bound is intended */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1596 
/* Completes assembly: runs the host SeqAIJ assembly, then (for a final
   assembly of an unfactored matrix) pushes the data to the GPU and installs
   the GPU multiplication kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* only an unfactored matrix has a Mat_SeqAIJCUSPARSE in spptr to copy into */
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1613 
1614 /* --------------------------------------------------------------------------------*/
1615 /*@
1616    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1618    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1619    assembly performance the user should preallocate the matrix storage by setting
1620    the parameter nz (or the array nnz).  By setting these parameters accurately,
1621    performance during matrix assembly can be increased by more than a factor of 50.
1622 
1623    Collective
1624 
1625    Input Parameters:
1626 +  comm - MPI communicator, set to PETSC_COMM_SELF
1627 .  m - number of rows
1628 .  n - number of columns
1629 .  nz - number of nonzeros per row (same for all rows)
1630 -  nnz - array containing the number of nonzeros in the various rows
1631          (possibly different for each row) or NULL
1632 
1633    Output Parameter:
1634 .  A - the matrix
1635 
1636    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1638    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1639 
1640    Notes:
1641    If nnz is given then nz is ignored
1642 
1643    The AIJ format (also called the Yale sparse matrix format or
1644    compressed row storage), is fully compatible with standard Fortran 77
1645    storage.  That is, the stored row and column indices can begin at
1646    either one (as in Fortran) or zero.  See the users' manual for details.
1647 
1648    Specify the preallocated storage with either nz or nnz (not both).
1649    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1650    allocation.  For large problems you MUST preallocate memory or you
1651    will get TERRIBLE performance, see the users' manual chapter on matrices.
1652 
1653    By default, this format uses inodes (identical nodes) when possible, to
1654    improve numerical efficiency of matrix-vector products and solves. We
1655    search for consecutive rows with the same nonzero structure, thereby
1656    reusing matrix information to achieve increased efficiency.
1657 
1658    Level: intermediate
1659 
1660 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1661 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation directly; the cast drops const to match its signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1673 
/* Destroys the GPU-side data in A->spptr (mult structures for an unfactored
   matrix, triangular-factor structures otherwise), then runs the host SeqAIJ
   destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
    /* nothing was ever pushed to the GPU when the offload mask is still UNALLOCATED */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1689 
/* Duplicates A (per cpvalues) with MatDuplicate_SeqAIJ, then equips the copy
   with its own CUSPARSE data structures, cusparse handle, and GPU operation
   table.  The device data itself is NOT copied: the duplicate starts with
   offloadmask PETSC_OFFLOAD_UNALLOCATED and is pushed to the GPU on first use. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
    C->spptr            = spptr;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    /* (the redundant zero-initialization of handle that was immediately
       overwritten by cusparseCreate has been removed) */
    Mat_SeqAIJCUSPARSETriFactors *trifactors = new Mat_SeqAIJCUSPARSETriFactors;
    trifactors->loTriFactorPtr          = 0;
    trifactors->upTriFactorPtr          = 0;
    trifactors->loTriFactorPtrTranspose = 0;
    trifactors->upTriFactorPtrTranspose = 0;
    trifactors->rpermIndices            = 0;
    trifactors->cpermIndices            = 0;
    trifactors->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    trifactors->handle                  = handle;
    trifactors->nnz                     = 0;
    C->spptr                            = trifactors;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1748 
/* Converts an already-created SeqAIJ matrix B, in place, into a SeqAIJCUSPARSE
   matrix: sets the CUDA default vector type, allocates the GPU-side data
   structures in B->spptr, and installs the CUSPARSE operation table. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: allocate an (empty) GPU mult structure; the inode.use
       flag cannot be consulted here because the matrix was just created */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
    B->spptr            = spptr;
  } else {
    /* factored matrix: allocate (empty) pointers for the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *trifactors = new Mat_SeqAIJCUSPARSETriFactors;
    trifactors->loTriFactorPtr          = 0;
    trifactors->upTriFactorPtr          = 0;
    trifactors->loTriFactorPtrTranspose = 0;
    trifactors->upTriFactorPtrTranspose = 0;
    trifactors->rpermIndices            = 0;
    trifactors->cpermIndices            = 0;
    trifactors->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    trifactors->handle                  = handle;
    trifactors->nnz                     = 0;
    B->spptr                            = trifactors;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing is on the GPU yet; the first assembly/mult triggers the copy */
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1802 
/* Type constructor for MATSEQAIJCUSPARSE: creates a plain SeqAIJ matrix and
   converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1812 
1813 /*MC
1814    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1815 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1817    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1818    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1819 
1820    Options Database Keys:
1821 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1822 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1823 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1824 
1825   Level: beginner
1826 
1827 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1828 M*/
1829 
1830 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1831 
1832 
/* Registers the CUSPARSE solver package for all factorization kinds it
   supports (LU, Cholesky, ILU, ICC) on SeqAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  PetscInt            i;
  const MatFactorType factortypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (i=0; i<(PetscInt)(sizeof(factortypes)/sizeof(factortypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factortypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1844 
1845 
/* Frees everything held by a Mat_SeqAIJCUSPARSE struct (the mult structures
   for A and A^T, the scratch work vector, and the cusparse handle) and zeroes
   the caller's pointer.  Safe to call when *cusparsestruct is already NULL.
   Fixes: the error codes returned by the two MultStruct destroy calls were
   silently discarded, and the handle test used an assignment inside the if
   condition. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1864 
/*
   Releases the three arrays of a CsrMatrix (values, column indices, row
   offsets) and the container itself, then nulls the caller's pointer.
   A NULL *mat is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;
    *mat = 0;                   /* null the caller's handle up front */
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
  }
  PetscFunctionReturn(0);
}
1877 
/*
   Overload for triangular-factor structs: tears down the cuSPARSE matrix
   descriptor, the solve-analysis info, and the CSR storage of one factor,
   then frees the struct and nulls the caller's pointer. NULL input is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1893 
/*
   Overload for SpMV structs: frees the device matrix — a cusparseHybMat_t when
   the storage format is ELL or HYB (ELL is stored via the hybrid type),
   otherwise a CsrMatrix — plus the matrix descriptor, the compressed-row index
   vector, and the device-resident alpha/beta scalar constants, then deletes
   the container and nulls the caller's pointer. NULL input is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code; it was previously dropped (the trifactor
           overload already wraps the same call in CHKERRQ) */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one are cudaMalloc'ed device scalars */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1921 
/*
   Frees a complete triangular-factors context: the four factor structs
   (lower/upper, plus their transposes), the row/column permutation index
   vectors, the scratch work vector, and the cuSPARSE handle, then deletes
   the container and nulls the caller's pointer. NULL input is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the PetscErrorCode returns; previously these were silently dropped */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* intentional assignment: only destroy the handle if one was created */
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1944 
1945