xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 69cdbcb9da6ff2586d8aaad5c78ee6fa55c7fe19)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Human-readable names for the MatCUSPARSEStorageFormat enum values; the trailing
   "MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0 entries follow the PetscOptionsEnum
   convention (enum type name, option prefix, terminator) — see the use in
   MatSetFromOptions_SeqAIJCUSPARSE below. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 
37 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
38 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
39 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
40 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
41 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
42 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
43 
/*
   MatCUSPARSESetStream - associates a CUDA stream with the matrix' cuSPARSE handle
   so subsequent cuSPARSE calls made through it are enqueued on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE; A->spptr must already hold a
            Mat_SeqAIJCUSPARSE (no NULL check is performed here)
-  stream - the CUDA stream to use

   The stream is also cached in the Mat_SeqAIJCUSPARSE struct so it survives
   handle changes.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* remember the stream, then apply it to the existing handle */
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
54 
/*
   MatCUSPARSESetHandle - installs a caller-supplied cuSPARSE handle on the matrix.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE; A->spptr must already hold a
            Mat_SeqAIJCUSPARSE
-  handle - the cuSPARSE handle the matrix should use from now on

   If the matrix already owns a different handle, that handle is destroyed before
   the new one is adopted. The pointer mode is (re)set to DEVICE, i.e. scalar
   arguments passed to cuSPARSE routines are expected to reside in device memory.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the previously held handle before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  /* alpha/beta scalars for SpMV etc. will be read from device memory */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
70 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle stored
   in the matrix, e.g. when handle ownership lies elsewhere.

   Input Parameter:
.  A - matrix of type SEQAIJCUSPARSE

   The handle itself is NOT destroyed; the owner remains responsible for it.
   A matrix whose GPU structures were never created (A->spptr == NULL) is a
   no-op rather than a crash.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* GPU part not set up yet: nothing to clear */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
79 
/*
   MatFactorGetSolverType_seqaij_cusparse - query callback composed on factor
   matrices created by MatGetFactor_seqaijcusparse_cusparse(); reports that this
   factor came from the MATSOLVERCUSPARSE package. The matrix argument is unused.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
86 
87 /*MC
88   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
89   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
91   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
92   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
93   algorithms are not recommended. This class does NOT support direct solver operations.
94 
95   Level: beginner
96 
97 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
98 M*/
99 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an (empty) SEQAIJCUSPARSE matrix
   that will hold a factorization of A, and wires up the symbolic-factorization
   function pointers for the requested factor type.

   Input Parameters:
+  A     - the matrix to be factored (assumed square: A->rmap->n is used for all
           four size arguments of the factor)
.  ftype - MAT_FACTOR_LU/ILU/ILUDT or MAT_FACTOR_CHOLESKY/ICC; anything else errors
-  B     - on output, the new factor matrix

   Preallocation is skipped (MAT_SKIP_ALLOCATION) since the symbolic phase
   allocates the factor's structure itself.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* LU-type factors inherit A's block sizes; the numeric routines are installed
       later by the symbolic routines */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
124 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(); records the requested GPU storage format.

   Sequential matrices keep a single format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL are equivalent here; every other operation is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops store the same format */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
142 
143 /*@
144    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
145    operation. Only the MatMult operation can use different GPU storage formats
146    for MPIAIJCUSPARSE matrices.
147    Not Collective
148 
149    Input Parameters:
150 +  A - Matrix of type SEQAIJCUSPARSE
151 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
152 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
153 
154    Output Parameter:
155 
156    Level: intermediate
157 
158 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
159 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the implementation composed on this matrix type; silently a
     no-op for types that did not compose "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
169 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the command-line options
   -mat_cusparse_mult_storage_format (SpMV only, unfactored matrices) and
   -mat_cusparse_storage_format (SpMV and TriSolve).

   Note: for factored matrices A->spptr holds a Mat_SeqAIJCUSPARSETriFactors
   (see the casts in the factorization routines below), NOT a Mat_SeqAIJCUSPARSE,
   so cusparsestruct->format may only be dereferenced when
   A->factortype == MAT_FACTOR_NONE. For factored matrices the option is
   presented with the CSR default instead of reading through the wrong struct.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  } else {
    /* spptr is a Mat_SeqAIJCUSPARSETriFactors here: do not read cusparsestruct->format */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
194 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU: delegate the symbolic
   phase to the CPU AIJ implementation, then install the CUSPARSE numeric
   routine so the numeric phase copies the factors to the GPU.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU: CPU symbolic phase plus
   installation of the CUSPARSE numeric routine (mirrors the ILU wrapper above).
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC: CPU symbolic phase plus
   installation of the CUSPARSE Cholesky numeric routine.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky: CPU symbolic
   phase plus installation of the CUSPARSE Cholesky numeric routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the unit-diagonal lower
   triangular factor L from the (combined L/U) CPU storage of a factored SeqAIJ
   matrix, uploads it to the GPU in CSR form, and runs the cuSPARSE triangular
   solve analysis on it.

   NOTE(review): the extraction assumes the layout produced by
   MatLUFactorNumeric_SeqAIJ, i.e. a->i/a->j/a->a hold the strictly lower part
   of row i in rows of a->i (row 0 of L is the implicit unit diagonal only) —
   confirm against the SeqAIJ factorization code if this layout ever changes.

   Only runs when the data is not already on the GPU (offloadmask UNALLOCATED
   or CPU). Empty matrices return immediately.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix; pinned host memory so the
         subsequent host->device copies (thrust assign) are fast */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix: row 0 holds only the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly lower entries of row i ... */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* ... then append the (unit) diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, lower, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information (legacy cuSPARSE csrsv API) */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR built from the pinned host buffers */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): catches only char* throws; std::bad_alloc from the new/thrust
         calls above would propagate uncaught */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - extracts the upper triangular factor
   U (non-unit diagonal, stored as 1/diag so the solve can multiply) from the CPU
   storage of a factored SeqAIJ matrix, uploads it to the GPU in CSR form, and
   runs the cuSPARSE triangular solve analysis on it.

   NOTE(review): iterates rows backward using a->diag; assumes the
   MatLUFactorNumeric_SeqAIJ layout where adiag[i]..adiag[i-1] brackets row i's
   upper part and v[nz] is the stored diagonal — confirm against the SeqAIJ
   factorization code if that layout ever changes.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host buffers) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: stored inverted (1/diag) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information (legacy cuSPARSE csrsv API) */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR built from the pinned host buffers */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): catches only char* throws; std::bad_alloc would propagate */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
440 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - resets any previous GPU factor
   data, rebuilds both triangular factors of an ILU/LU factorization on the GPU,
   and caches the row/column permutation index sets when they are not the
   identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *rowind,*colind;
  PetscInt                     n = A->rmap->n,npermuted;

  PetscFunctionBegin;
  /* discard stale GPU factor data, then rebuild both triangles */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: copied to the GPU only when not the identity */
  ierr = ISGetIndices(isrow,&rowind);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rowind, rowind+n);
  }
  ierr = ISRestoreIndices(isrow,&rowind);CHKERRQ(ierr);

  /* column permutation: likewise only when not the identity */
  ierr = ISGetIndices(iscol,&colind);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(colind, colind+n);
  }

  /* log the host->device traffic for however many permutation vectors were copied */
  npermuted = (row_identity ? 0 : 1) + (col_identity ? 0 : 1);
  if (npermuted) {
    ierr = PetscLogCpuToGpu(npermuted*n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&colind);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
488 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds BOTH triangular factors of an
   ICC/Cholesky factorization on the GPU from one pass over the CPU data. The
   upper factor U is stored explicitly; the lower solve reuses U's CSR structure
   with CUSPARSE_OPERATION_TRANSPOSE (solving U^T x = b), with separately scaled
   values in AALo.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and to
   Mat_SeqSBAIJ (for b->i/b->j/b->a). This relies on the leading layout of the
   two structs being compatible for these fields — confirm against the struct
   definitions if either header changes.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements: stored inverted (1/diag) for both factors */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: negated for U, additionally scaled by 1/diag for the L solve */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information (legacy cuSPARSE csrsv API) */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the "lower" solve is expressed as the
         TRANSPOSE solve of this upper-stored structure, hence FILL_MODE_UPPER here */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation: transpose solve of the upper structure */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as U, different values (AALo) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      /* NOTE(review): catches only char* throws; std::bad_alloc would propagate */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
636 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - resets any previous GPU factor
   data, rebuilds the ICC triangular factors on the GPU, and caches the
   permutation and its inverse when the ordering is not the identity.

   nnz is set to (a->nz-n)*2 + n: the symmetric factor stores only one triangle
   (a->nz entries), but the full factorization logically has both off-diagonal
   triangles plus one diagonal.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* copy the permutation (and its inverse) to the GPU only when not the identity */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* rows use the permutation itself, columns its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
673 
/* Numeric LU factorization: performs the factorization on the CPU with the
   standard SeqAIJ kernel, selects the GPU solve routines according to the
   ordering, and uploads/analyzes the triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* natural ordering admits the cheaper solve path with no device-side gathers */
  ierr = ISIdentity(fact->row,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve implementation is provided */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors and run the cuSPARSE solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
703 
/* Numeric Cholesky/ICC factorization: factors on the CPU, then picks the GPU
   solve kernels based on whether the ordering is the identity, and uploads
   the triangular factors for device solves. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* identity permutation allows the solve variant that skips GPU-side reordering */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve implementation is provided */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors and run the cuSPARSE solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
732 
/* Builds the explicit transpose (CSC form) of one triangular factor and runs
   the cuSPARSE solve analysis on it.

   Input:  handle - the cuSPARSE handle
           fact   - an existing triangular factor in CSR form
   Output: *factT - a newly allocated Mat_SeqAIJCUSPARSETriFactorStruct,
                    owned by the caller.

   The transpose swaps the fill mode (upper <-> lower); the matrix type,
   index base and diagonal type carry over unchanged.  Since the transpose is
   stored explicitly, solves on it use CUSPARSE_OPERATION_NON_TRANSPOSE. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactorBuildTranspose(cusparseHandle_t handle,Mat_SeqAIJCUSPARSETriFactorStruct *fact,Mat_SeqAIJCUSPARSETriFactorStruct **factT)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *T;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;

  PetscFunctionBegin;
  T = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* descriptor: same type/base/diag as the source, fill mode flipped */
  indexBase = cusparseGetMatIndexBase(fact->descr);
  stat = cusparseCreateMatDescr(&T->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(T->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(T->descr, cusparseGetMatType(fact->descr));CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(T->descr, cusparseGetMatFillMode(fact->descr)==CUSPARSE_FILL_MODE_UPPER ?
                                CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(T->descr, cusparseGetMatDiagType(fact->descr));CHKERRCUSPARSE(stat);

  /* solve analysis information for the transposed factor */
  stat = cusparseCreateSolveAnalysisInfo(&T->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so solves on it are non-transposed */
  T->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU storage for the CSC (i.e. transposed CSR) representation */
  T->csrMat = new CsrMatrix;
  T->csrMat->num_rows       = fact->csrMat->num_rows;
  T->csrMat->num_cols       = fact->csrMat->num_cols;
  T->csrMat->num_entries    = fact->csrMat->num_entries;
  T->csrMat->row_offsets    = new THRUSTINTARRAY32(fact->csrMat->num_rows+1);
  T->csrMat->column_indices = new THRUSTINTARRAY32(fact->csrMat->num_entries);
  T->csrMat->values         = new THRUSTARRAY(fact->csrMat->num_entries);

  /* CSR -> CSC conversion is exactly the transpose */
  stat = cusparse_csr2csc(handle, fact->csrMat->num_rows,
                          fact->csrMat->num_cols, fact->csrMat->num_entries,
                          fact->csrMat->values->data().get(),
                          fact->csrMat->row_offsets->data().get(),
                          fact->csrMat->column_indices->data().get(),
                          T->csrMat->values->data().get(),
                          T->csrMat->column_indices->data().get(),
                          T->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(handle, T->solveOp,
                           T->csrMat->num_rows, T->csrMat->num_entries,
                           T->descr, T->csrMat->values->data().get(),
                           T->csrMat->row_offsets->data().get(), T->csrMat->column_indices->data().get(),
                           T->solveInfo);CHKERRCUSPARSE(stat);

  *factT = T;
  PetscFunctionReturn(0);
}

/* Builds and analyzes the transposes of both triangular factors, storing
   them in loTriFactorPtrTranspose/upTriFactorPtrTranspose for use by the
   MatSolveTranspose paths.  Called lazily on the first transpose solve.

   Changes vs. the previous version: the duplicated lower/upper code is
   factored into MatSeqAIJCUSPARSETriFactorBuildTranspose(), and the dead
   initializations of the transpose pointers (which were immediately
   overwritten by `new`) are gone. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT,*upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETriFactorBuildTranspose(cusparseTriFactors->handle,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETriFactorBuildTranspose(cusparseTriFactors->handle,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
863 
/*
  MatSeqAIJCUSPARSEGenerateTransposeForMult - builds the explicit transpose of
  the SpMV data structure (cusparsestruct->matTranspose) so MatMultTranspose
  can run as a plain (non-transposed) multiply on the transposed matrix.

  CSR format:     one csr2csc conversion yields the transpose directly.
  ELL/HYB format: the HYB matrix is converted to CSR, transposed with
                  csr2csc, and converted back to HYB; the CSR temporaries are
                  freed before returning.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; /* NOTE(review): dead initializer - overwritten by `new` below before any use */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate the transposed multiply structure; index base is inherited from the forward matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars (1, 0, 1) for the SpMV calls;
     pointer mode DEVICE tells cuSPARSE to read the scalars from GPU memory */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose swaps the row/column dimensions; nnz is unchanged */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* upload the full (uncompressed) host row offsets a->i; presumably needed
       because matrix->row_offsets may correspond to the compressed-row form -
       TODO confirm.  NOTE(review): assigned with `new` without freeing any
       previous allocation; the destroy path in MatSeqAIJCUSPARSECopyToGPU is
       expected to have deleted it - confirm */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    /* log the host->device traffic: row offsets + column indices (int) plus the three scalars and values */
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB: ELL uses the MAX partition, HYB the AUTO partition */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete the CSR temporaries; only the HYB matrix is kept */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
990 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using
  explicitly transposed triangular factors (built lazily on first use).

  The right-hand side is gathered through the stored row permutation before
  the solves and the result through the column permutation afterwards; both
  index arrays were uploaded by the ILU/ICC analysis routines.  Requires a
  non-identity ordering (rpermIndices/cpermIndices must exist).

  NOTE(review): this applies rpermIndices to b and cpermIndices to x exactly
  as the forward solve does; for a transpose solve with distinct row/column
  orderings one would expect the roles to swap - confirm against the CPU
  MatSolveTranspose_SeqAIJ.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Build and analyze the transposed factors on the fly the first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather b through the row permutation into x (x doubles as scratch here) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: tempGPU = U^T \ x (for A^T the upper factor is applied first) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L: x = L^T \ tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the Vec array pointers and wait for the asynchronous GPU work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1061 
/* Transpose triangular solve for the natural-ordering case: no permutations
   are needed, so b feeds the (transposed) U-solve directly and the L-solve
   writes straight into x.  The explicitly transposed factors are built
   lazily on the first transpose solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors the first time a transpose solve is requested */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* for A^T the upper factor is applied first: work = U^T \ b */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get());CHKERRCUSPARSE(stat);

  /* then the lower factor: x = L^T \ work */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x);CHKERRCUSPARSE(stat);

  /* release the Vec arrays and wait for the asynchronous work to complete */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1113 
/*
  MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU for a permuted
  (non-natural) ordering: gathers b through the row permutation, does the
  forward L-solve and backward U-solve, then scatters the result through the
  column permutation.  rpermIndices/cpermIndices must have been uploaded by
  the ILU/ICC analysis routines (non-identity ordering).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather b through the row permutation into the work vector.
     NOTE(review): the end iterator is built from bGPU rather than bGPU+n;
     this appears to rely on permutation_iterator comparing only the index
     iterator (rpermIndices->end()) - confirm against the Thrust docs */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: xarray = L \ tempGPU (forward substitution) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: tempGPU = U \ xarray (backward substitution) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, gather the result through the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  /* restore the Vec arrays and wait for the asynchronous GPU work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1172 
/* Triangular solve x = U \ (L \ b) for the natural-ordering case: no
   permutations are needed, so b feeds the L-solve directly and the U-solve
   writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward substitution: work = L \ b */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get());CHKERRCUSPARSE(stat);

  /* backward substitution: x = U \ work */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUSPARSE(stat);

  /* release the Vec arrays and wait for the asynchronous work to complete */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1216 
/* Mirror the host-side AIJ data of A onto the GPU in the configured cusparse
   storage format (CSR, ELL, or HYB).  No-op when A is bound to the CPU or the
   device copy is already current.  If the nonzero pattern is unchanged and the
   format is CSR, only the values array is re-uploaded (and the stored transpose,
   if any, is refreshed on the GPU); otherwise all device structures are rebuilt. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Same nonzero pattern as the previous upload: copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before; a numeric-only csr2csc on the GPU
         regenerates the transpose values (the pattern is unchanged) */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* Pattern or format changed: drop and rebuild every device-side structure */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage keeps only nonempty rows; ridx maps them to full rows */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL; /* In this case, one should not dereference ridx */
        }
        cusparsestruct->nrows = m;

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (pointer mode DEVICE is set below) */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* a temporary CSR copy on the GPU is needed as input for csr2hyb */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed once the HYB matrix exists */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1362 
/* Thrust functor for zip iterators: given a (source, destination) tuple,
   accumulates the source element into the destination (dst += src).  Used to
   scatter-add the compressed-row work vector into the full result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1372 
/* Per-product data attached to C->product for sparse-dense products on the GPU */
typedef struct {
  PetscBool   cisdense; /* C was MATSEQDENSE on entry; convert the result back to the host type after the numeric phase */
  PetscScalar *Bt;      /* device buffer for the explicit transpose of B (ABt/RARt only, since cusparse spmm cannot transpose B) */
  Mat         X;        /* intermediate dense product for RARt (X = A*B^t) and PtAP (X = A*B) */
} MatMatCusparse;
1378 
/* Destructor for the MatMatCusparse product data: frees the device transpose
   buffer, the intermediate dense matrix, and the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;
  cudaError_t    cuerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);   /* allocated for ABt/RARt products only; cudaFree(NULL) is a no-op */
  ierr  = MatDestroy(&mm->X);CHKERRQ(ierr);     /* intermediate product used by RARt/PtAP */
  ierr  = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1391 
1392 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1393 
/* Numeric phase for C = op(A)*op(B) with A SEQAIJCUSPARSE and B dense, executed
   as a cusparse SpMM on the GPU.  AtB uses the explicitly stored transpose of A;
   ABt/RARt first form B^t explicitly with cublas geam (cusparse spmm cannot
   transpose the dense operand); RARt/PtAP store the sparse product in the
   intermediate X and finish with a dense-dense multiply against B (or B^t). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the sparse operand and the m x k times k x n dimensions of the SpMM */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use the explicitly stored transpose so the spmm itself never transposes */
    if (!cusp->matTranspose) {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    }
    mat = cusp->matTranspose;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->cmap->n;
    k   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicit out-of-place transpose of B into the preallocated mmdata->Bt */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }
  /* for RARt/PtAP the spmm result goes to the intermediate X, not directly to C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);

  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* C = B * X  (= R * (A*R^t)) via the dense-dense product */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* C = B^t * X  (= P^t * (A*P)) via the dense-dense product */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the host-side types the caller originally passed */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1512 
/* Symbolic phase for sparse-dense products on the GPU: sizes and types C,
   allocates the MatMatCusparse product data (device B^t buffer for ABt/RARt,
   intermediate X for RARt/PtAP), and installs the numeric routine. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the resulting C for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    /* device buffer to hold the explicit transpose of B formed in the numeric phase */
    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
1586 
1587 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1588 
1589 /* handles dense B */
/* Product dispatch for a SEQAIJCUSPARSE A with dense B: installs the GPU
   symbolic routine for the supported product types, or falls back to the
   plain SeqAIJ-SeqDense path when A is bound to the CPU. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  /* a CPU-bound A cannot feed the GPU kernels: delegate to the host implementation */
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    break;
  default:
    /* unsupported types are left alone for other implementations to handle */
    break;
  }
  PetscFunctionReturn(0);
}
1614 
/* yy = A*xx, implemented by delegating to MatMultAdd with a NULL add-vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1623 
/* yy = A^t * xx on the GPU.  Uses the explicitly stored transpose of A
   (generated lazily on first use) so the spmv itself never transposes. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the device-side transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    /* yy = alpha * A^t * xx + beta_zero * yy */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1674 
1675 
/* zz = A*xx + yy on the GPU (yy may be NULL for a plain MatMult, or alias zz).
   When A uses compressed-row storage the spmv result lands in a work vector
   sized by the number of nonempty rows and is then scatter-added into zz. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    compressed = a->compressedrow.use; /* Does the matrix use compressed rows (i.e., drop zero rows)? */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);

    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv result if compressed */
    beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      /* HYB/ELL path; skip the spmv entirely when there are no stored rows */
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) { /* MatMultAdd: zz = A*xx + yy */
      if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
      } else if (zz != yy) { /* A's not compreseed. zz already contains A*xx, and we just need to add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }

    /* ScatterAdd the result from work vector into the full vector when A is compressed */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (compressed) {
      /* cprowIndices maps compressed-row slots to their positions in the full vector */
      thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1754 
/* zz = A^t*xx + yy on the GPU, using the explicitly stored transpose of A
   (generated lazily on first use). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*beta;
  PetscErrorCode                  ierr;
  cudaError_t                     cerr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  /* Note unlike Mat, MatTranspose uses non-compressed row storage */
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* when yy aliases zz the spmv accumulates into zz directly (beta = 1) */
    beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               zarray);CHKERRCUSPARSE(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the spmv entirely when there are no stored rows */
      if (cusparsestruct->nrows) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 zarray);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* distinct yy: zz currently holds A^t*xx (beta was zero), so add yy back in */
    if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);}
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1813 
/* Finish assembly: run the base SeqAIJ assembly, then (for a final assembly of
   a GPU-enabled, unfactored matrix) upload the data and install the GPU kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* a flush assembly or a CPU-bound matrix needs no GPU work */
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (A->boundtocpu) PetscFunctionReturn(0);
  /* only unfactored matrices carry the plain AIJ data that is mirrored on the GPU */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* (re)install the GPU multiplication kernels */
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1830 
1831 /* --------------------------------------------------------------------------------*/
1832 /*@
1833    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1834    (the default parallel PETSc format). This matrix will ultimately be pushed down
1835    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1836    assembly performance the user should preallocate the matrix storage by setting
1837    the parameter nz (or the array nnz).  By setting these parameters accurately,
1838    performance during matrix assembly can be increased by more than a factor of 50.
1839 
1840    Collective
1841 
1842    Input Parameters:
1843 +  comm - MPI communicator, set to PETSC_COMM_SELF
1844 .  m - number of rows
1845 .  n - number of columns
1846 .  nz - number of nonzeros per row (same for all rows)
1847 -  nnz - array containing the number of nonzeros in the various rows
1848          (possibly different for each row) or NULL
1849 
1850    Output Parameter:
1851 .  A - the matrix
1852 
1853    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1854    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1855    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1856 
1857    Notes:
1858    If nnz is given then nz is ignored
1859 
1860    The AIJ format (also called the Yale sparse matrix format or
1861    compressed row storage), is fully compatible with standard Fortran 77
1862    storage.  That is, the stored row and column indices can begin at
1863    either one (as in Fortran) or zero.  See the users' manual for details.
1864 
1865    Specify the preallocated storage with either nz or nnz (not both).
1866    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1867    allocation.  For large problems you MUST preallocate memory or you
1868    will get TERRIBLE performance, see the users' manual chapter on matrices.
1869 
1870    By default, this format uses inodes (identical nodes) when possible, to
1871    improve numerical efficiency of matrix-vector products and solves. We
1872    search for consecutive rows with the same nonzero structure, thereby
1873    reusing matrix information to achieve increased efficiency.
1874 
1875    Level: intermediate
1876 
1877 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1878 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Standard PETSc creation sequence: create the object, set its (sequential, so
     local == global) sizes, fix the type to AIJCUSPARSE, then preallocate storage
     directly through the SeqAIJ preallocation routine.  The cast drops const
     because MatSeqAIJSetPreallocation_SeqAIJ() takes a non-const PetscInt*. */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1890 
/* Destroy an AIJCUSPARSE matrix: release the GPU-side structures held in A->spptr,
   strip the composed methods installed at conversion time, then fall through to the
   host SeqAIJ destructor for the CPU-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* A factored matrix stores triangular factors in spptr; an unfactored one stores
     the multiply structures, which exist only once something was offloaded. */
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  /* Uncompose the type-specific methods so the object no longer advertises them. */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1910 
1911 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1912 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate an AIJCUSPARSE matrix by first duplicating it on the host as a plain
   SeqAIJ matrix and then converting the copy in place back to AIJCUSPARSE, which
   (re)installs the GPU data structures and the type-specific operations. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1922 
/* Bind (flg == PETSC_TRUE) or unbind the matrix to the CPU by swapping the
   operation table between the host SeqAIJ kernels and the CUSPARSE kernels. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscFunctionBegin;
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  /* We need to take care of function composition also */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  if (flg) {
    /* Bound to CPU: route mult/assembly/duplicate through the host SeqAIJ kernels.
       NOTE(review): destroy is intentionally left pointing at the CUSPARSE destructor
       so the GPU-side data in spptr is still released — confirm this asymmetry is wanted. */
    A->ops->mult             = MatMult_SeqAIJ;
    A->ops->multadd          = MatMultAdd_SeqAIJ;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate        = MatDuplicate_SeqAIJ;
  } else {
    /* Unbound: restore the CUSPARSE (GPU) implementations. */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
    A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
    A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1952 
/* Convert a SeqAIJ matrix in place into an AIJCUSPARSE matrix: allocate the GPU
   bookkeeping structure appropriate to the factor state, install the CUSPARSE
   operation table, and compose the type-specific methods.  Only
   MAT_INPLACE_MATRIX reuse is supported. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  if (reuse != MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet supported");

  /* GPU matrices operate on CUDA vectors by default. */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype == MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;

    cusp->mat            = 0;
    cusp->matTranspose   = 0;
    cusp->workVector     = 0;
    cusp->rowoffsets_gpu = 0;
    cusp->format         = MAT_CUSPARSE_CSR;
    cusp->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    cusp->handle         = handle;
    cusp->nonzerostate   = 0;
    B->spptr             = cusp;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    B->spptr                     = tri;
  }

  /* Install the CUSPARSE operation table. */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2012 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a host SeqAIJ matrix,
   then convert it in place to acquire the GPU structures and operation table. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2022 
2023 /*MC
2024    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2025 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2027    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2028    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2029 
2030    Options Database Keys:
2031 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2032 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2033 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2034 
2035   Level: beginner
2036 
2037 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2038 M*/
2039 
2040 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2041 
2042 
/* Register the CUSPARSE solver package for every factorization kind it supports
   (LU, Cholesky, ILU, ICC) on sequential AIJCUSPARSE matrices; all kinds share
   the same factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType kinds[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(kinds)/sizeof(kinds[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,kinds[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2054 
2055 
/* Free the GPU-side data of an unfactored AIJCUSPARSE matrix: both multiply
   structures, the scratch vector, the row-offset array, and the cuSPARSE handle,
   then the bookkeeping struct itself.  Safe to call when *cusparsestruct is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* Propagate failures: these destroy routines return PetscErrorCode, which the
       previous code silently discarded. */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((*cusparsestruct)->handle) { /* handle may be 0 if cusparseCreate() never ran */
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
2075 
/* Delete a CsrMatrix wrapper and its three device arrays (values, column indices,
   row offsets), then zero the caller's pointer.  A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
2088 
/* Destroy one triangular-factor structure: its cuSPARSE matrix descriptor, its
   solve-analysis info, and its CSR storage, then zero the caller's pointer.
   A NULL *trifactor is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
2104 
/* Destroy a multiply structure in the given storage format: the wrapped matrix
   (hybrid/ELL via cuSPARSE, CSR via CsrMatrix_Destroy), the matrix descriptor,
   the compressed-row index array, and the device-resident scalar constants,
   then zero the caller's pointer.  A NULL *matstruct is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB are both stored as a cuSPARSE hybrid matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code; it was previously discarded */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one are scalars living in device memory (cudaMalloc'ed) */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2132 
/* Release the contents of a triangular-factors structure (all four factor
   structures, the permutation index arrays, and the scratch vector) without
   freeing the structure itself or its cuSPARSE handle, so it can be refilled
   by a subsequent factorization.  A NULL *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check each destroy's return code; they were previously discarded */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector = 0;
  }
  PetscFunctionReturn(0);
}
2150 
/* Fully destroy a triangular-factors structure: reset its contents, destroy the
   cuSPARSE handle, and delete the structure itself, zeroing the caller's pointer.
   A NULL *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the reset's return code; it was previously discarded */
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((*trifactors)->handle) { /* handle may be 0 if cusparseCreate() never ran */
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
2167 
2168