xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 117ef88edefbfc12e7c19efe87a19a2e1b0acd4f)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* String table for the MatCUSPARSEStorageFormat enum; follows the PetscEnum convention:
   value names ("CSR","ELL","HYB"), then the enum type name, then the common value prefix,
   then a terminating 0 sentinel. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
/* Forward declarations for the file-local SeqAIJCUSPARSE implementation. */

/* symbolic/numeric factorization entry points (symbolic work is delegated to the host SeqAIJ routines) */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* GPU triangular solves; the NaturalOrdering variants skip the row/column permutation steps */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* destructors for the GPU-side data structures; note MatSeqAIJCUSPARSEMultStruct_Destroy
   is a C++ overload set (this file is compiled as C++), one overload per structure type */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
43 
44 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
45 {
46   cusparseStatus_t   stat;
47   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
48 
49   PetscFunctionBegin;
50   cusparsestruct->stream = stream;
51   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
52   PetscFunctionReturn(0);
53 }
54 
55 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
56 {
57   cusparseStatus_t   stat;
58   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
59 
60   PetscFunctionBegin;
61   if (cusparsestruct->handle != handle) {
62     if (cusparsestruct->handle) {
63       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
64     }
65     cusparsestruct->handle = handle;
66   }
67   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
68   PetscFunctionReturn(0);
69 }
70 
71 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
72 {
73   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
74 
75   PetscFunctionBegin;
76   if (cusparsestruct->handle) cusparsestruct->handle = 0;
77   PetscFunctionReturn(0);
78 }
79 
/* MatFactorGetSolverType_seqaij_cusparse - reports MATSOLVERCUSPARSE as the solver
   package for factors created by this file; composed on factor matrices under the
   "MatFactorGetSolverType_C" key. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
86 
87 /*MC
88   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
89   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
91   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
92   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
93   algorithms are not recommended. This class does NOT support direct solver operations.
94 
95   Level: beginner
96 
97 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
98 M*/
99 
/* MatGetFactor_seqaijcusparse_cusparse - creates an (empty) SEQAIJCUSPARSE matrix B
   set up to hold a factorization of A of the requested type, installing the
   appropriate symbolic-factorization function pointers.  LU/ILU/ILUDT and
   Cholesky/ICC are supported; anything else errors. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  /* factortype is set before MatSetType so the type-create routine can see it */
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    /* NOTE(review): unlike the LU branch, block sizes are not copied from A here --
       confirm whether MatSetBlockSizesFromMats was omitted intentionally */
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factors are filled in by the symbolic routines, so skip host allocation now */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
124 
125 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
126 {
127   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
128 
129   PetscFunctionBegin;
130   switch (op) {
131   case MAT_CUSPARSE_MULT:
132     cusparsestruct->format = format;
133     break;
134   case MAT_CUSPARSE_ALL:
135     cusparsestruct->format = format;
136     break;
137   default:
138     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
139   }
140   PetscFunctionReturn(0);
141 }
142 
143 /*@
144    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
145    operation. Only the MatMult operation can use different GPU storage formats
146    for MPIAIJCUSPARSE matrices.
147    Not Collective
148 
149    Input Parameters:
150 +  A - Matrix of type SEQAIJCUSPARSE
151 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
152 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
153 
154    Output Parameter:
155 
156    Level: intermediate
157 
158 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
159 @*/
/* Public entry point documented in the man page above; dispatches to the
   type-specific "MatCUSPARSESetFormat_C" method if the matrix type provides one
   (PetscTryMethod is a no-op for types that do not). */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
169 
/* Options processing for SEQAIJCUSPARSE matrices: lets the user pick the GPU
   storage format for SpMV only (-mat_cusparse_mult_storage_format, non-factored
   matrices only) or for all operations (-mat_cusparse_storage_format). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* NOTE(review): this second query reads cusparsestruct->format unconditionally,
     but for a factored matrix A->spptr presumably holds a
     Mat_SeqAIJCUSPARSETriFactors, not a Mat_SeqAIJCUSPARSE -- confirm the cast
     above is safe in that case */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
194 
/* ILU symbolic factorization: reuse the host SeqAIJ symbolic routine, then point the
   numeric stage at the CUSPARSE implementation.  The assignment must stay AFTER the
   call, since the SeqAIJ symbolic routine presumably installs its own host numeric
   pointer -- do not reorder (TODO confirm). */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
/* LU symbolic factorization: delegate to the host SeqAIJ routine, then override the
   numeric stage with the CUSPARSE implementation (assignment must follow the call;
   the SeqAIJ routine presumably sets this pointer itself -- TODO confirm). */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
/* ICC symbolic factorization: delegate to the host SeqAIJ routine, then override the
   numeric stage with the CUSPARSE implementation (assignment must follow the call). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
/* Cholesky symbolic factorization: delegate to the host SeqAIJ routine, then override
   the numeric stage with the CUSPARSE implementation (assignment must follow the call). */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/* Builds, on the GPU, the unit-diagonal lower triangular factor L of an ILU/LU
   factorization from the host SeqAIJ storage of the (already factored) matrix A.
   The host factor stores the strictly lower part of each row with the unit
   diagonal implicit; this routine materializes the diagonal 1.0 entries,
   assembles L in CSR form in pinned host memory, uploads it to thrust device
   arrays, and runs the cuSPARSE triangular-solve analysis phase.  A no-op when
   the device copy is already current or n == 0. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;  /* CSR row offsets / column indices staged in pinned memory */
  PetscScalar                       *AALo;         /* CSR values staged in pinned memory */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1; row 0 has none) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host staging buffers for the upload below */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      /* row 0 holds only its unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append its unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* 0-based triangular matrix, lower fill, unit diagonal (matches the 1.0's staged above) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* thrust assign() copies the pinned host buffers to the device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* staging buffers are no longer needed once the device arrays own the data */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/* Builds, on the GPU, the upper triangular factor U (non-unit diagonal) of an
   ILU/LU factorization from the host SeqAIJ storage of the (already factored)
   matrix A.  The host factor stores the diagonal as its inverse-to-be and the
   rows of U addressed through a->diag (traversed here from the last row back),
   so this routine assembles U in CSR form in pinned host memory with the
   reciprocal diagonal first in each row, uploads it, and runs the cuSPARSE
   triangular-solve analysis.  A no-op when the device copy is current or n == 0. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;  /* CSR row offsets / column indices staged in pinned memory */
  PetscScalar                       *AAUp;         /* CSR values staged in pinned memory */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases along U's storage here, hence first-minus-last gives the count
         (NOTE(review): relies on the SeqAIJ factor's reversed U layout -- confirm) */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host staging buffers for the upload below */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* walk the rows from last to first, filling the CSR arrays from the back */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* store 1/diag so the device solve can multiply instead of divide */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* 0-based triangular matrix, upper fill, NON-unit diagonal (the 1/diag staged above) */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* thrust assign() copies the pinned host buffers to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* staging buffers are no longer needed once the device arrays own the data */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
440 
441 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
442 {
443   PetscErrorCode               ierr;
444   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
445   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
446   IS                           isrow = a->row,iscol = a->icol;
447   PetscBool                    row_identity,col_identity;
448   const PetscInt               *r,*c;
449   PetscInt                     n = A->rmap->n;
450 
451   PetscFunctionBegin;
452   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
453   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
454   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
455 
456   cusparseTriFactors->workVector = new THRUSTARRAY(n);
457   cusparseTriFactors->nnz=a->nz;
458 
459   A->offloadmask = PETSC_OFFLOAD_BOTH;
460   /* lower triangular indices */
461   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
462   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
463   if (!row_identity) {
464     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
465     cusparseTriFactors->rpermIndices->assign(r, r+n);
466   }
467   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
468 
469   /* upper triangular indices */
470   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
471   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
472   if (!col_identity) {
473     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
474     cusparseTriFactors->cpermIndices->assign(c, c+n);
475   }
476 
477   if (!row_identity && !col_identity) {
478     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
479   } else if(!row_identity) {
480     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
481   } else if(!col_identity) {
482     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
483   }
484 
485   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
486   PetscFunctionReturn(0);
487 }
488 
/* Builds both triangular factors of an ICC/Cholesky factorization on the GPU from
   the host storage of the factored matrix, which holds only the upper factor.
   The "upper" factor is uploaded as U with unit diagonal; the "lower" factor
   reuses U's sparsity pattern (same AiUp/AjUp) with rescaled values and is solved
   as U^T (fill mode UPPER + operation TRANSPOSE below).  A no-op when the device
   copy is current or n == 0. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;  /* shared CSR pattern for both factors, staged in pinned memory */
  PetscScalar                       *AAUp;         /* values of the upper factor */
  PetscScalar                       *AALo;         /* values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is a Mat_SeqAIJ (cast above); re-casting to Mat_SeqSBAIJ
     relies on i/j/a living at compatible offsets in both structs -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host staging buffers for the uploads below */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the row's diagonal entry; both factors get its reciprocal here */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries: negated for U, additionally scaled by 1/diag for the L values */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* 0-based triangular matrix, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      /* thrust assign() copies the pinned host buffers to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* same upper-triangular pattern, but solved as its TRANSPOSE (see solveOp below),
         which makes it act as the lower factor; diagonal is non-unit (1/diag values) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity pattern as the upper factor, different values (AALo) */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
636 
/* Sets up all GPU data needed to apply an ICC/Cholesky factor: rebuilds both
   triangular factors, allocates the work vector, and (for a non-identity
   ordering) uploads the permutation and its inverse for the two solve phases. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector holding the intermediate result between the two solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both factors share the stored diagonal, hence off-diagonals count twice */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* rows use the permutation, columns its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
673 
/* Numeric LU factorization: the factorization itself runs on the host via the
   SeqAIJ kernel; afterwards the GPU solve routines are selected and the
   triangular factors are copied to the device and analyzed. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowId,colId;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization; the result lives on the CPU for now */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* with identity row/column permutations the cheaper natural-ordering solves apply */
  ierr = ISIdentity(fact->row,&rowId);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colId);CHKERRQ(ierr);
  if (rowId && colId) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
703 
/* Numeric Cholesky factorization: performed on the host via the SeqAIJ kernel;
   then the appropriate GPU solve routines are installed and the factors are
   copied to the device and analyzed. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization; the result lives on the CPU for now */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* an identity permutation lets us use the cheaper natural-ordering solves */
  ierr = ISIdentity(fact->row,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
732 
/*
  Build the transpose (CSC form) of one triangular factor and run the cuSPARSE
  solve analysis on it.  The same routine serves the lower and the upper
  factor: transposition flips the fill mode, while the index base, matrix type
  and diagonal type are inherited from the input descriptor.

  Allocates a new Mat_SeqAIJCUSPARSETriFactorStruct and returns it in *factT;
  the caller takes ownership and stores it in the tri-factors container.
*/
static PetscErrorCode MatSeqAIJCUSPARSETransposeTriFactor(cusparseHandle_t handle,Mat_SeqAIJCUSPARSETriFactorStruct *fact,Mat_SeqAIJCUSPARSETriFactorStruct **factTout)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *factT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  factT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors: the transpose swaps upper <-> lower fill mode */
  matrixType = cusparseGetMatType(fact->descr);
  indexBase  = cusparseGetMatIndexBase(fact->descr);
  fillMode   = cusparseGetMatFillMode(fact->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(fact->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&factT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(factT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(factT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(factT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(factT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&factT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve uses the non-transpose op */
  factT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the factor (dimensions carry over) */
  factT->csrMat = new CsrMatrix;
  factT->csrMat->num_rows       = fact->csrMat->num_rows;
  factT->csrMat->num_cols       = fact->csrMat->num_cols;
  factT->csrMat->num_entries    = fact->csrMat->num_entries;
  factT->csrMat->row_offsets    = new THRUSTINTARRAY32(fact->csrMat->num_rows+1);
  factT->csrMat->column_indices = new THRUSTINTARRAY32(fact->csrMat->num_entries);
  factT->csrMat->values         = new THRUSTARRAY(fact->csrMat->num_entries);

  /* compute the transpose of the factor, i.e. the CSC */
  stat = cusparse_csr2csc(handle, fact->csrMat->num_rows,
                          fact->csrMat->num_cols, fact->csrMat->num_entries,
                          fact->csrMat->values->data().get(),
                          fact->csrMat->row_offsets->data().get(),
                          fact->csrMat->column_indices->data().get(),
                          factT->csrMat->values->data().get(),
                          factT->csrMat->column_indices->data().get(),
                          factT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(handle, factT->solveOp,
                           factT->csrMat->num_rows, factT->csrMat->num_entries,
                           factT->descr, factT->csrMat->values->data().get(),
                           factT->csrMat->row_offsets->data().get(), factT->csrMat->column_indices->data().get(),
                           factT->solveInfo);CHKERRCUSPARSE(stat);

  *factTout = factT;
  PetscFunctionReturn(0);
}

/*
  Create and analyze the explicit transposes of both triangular factors so
  MatSolveTranspose_SeqAIJCUSPARSE* can run forward/backward solves on them.
  Built lazily: callers invoke this only when the transpose pointers are NULL,
  so no pre-existing transpose is freed here.

  (The previous version initialized the local transpose pointers from
  A->spptr and immediately overwrote them with `new` -- dead stores, removed.)
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT,*upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* the transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor(cusparseTriFactors->handle,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* the transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor(cusparseTriFactors->handle,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
863 
/*
  Build the multiply structure for A^T (cusparsestruct->matTranspose) so that
  MatMultTranspose can use an explicitly stored transpose.  Supports the CSR
  format directly and the ELL/HYB formats via a HYB->CSR->CSC->HYB round trip.
  Built lazily; any pre-existing matTranspose is not freed here, so callers
  must only invoke this when no transpose exists yet.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* allocated below; the former init from matTranspose was a dead store */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose multiply information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars, required because the handle
     uses CUSPARSE_POINTER_MODE_DEVICE below */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = new CsrMatrix;
    /* transpose has swapped dimensions */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* NOTE(review): any previous rowoffsets_gpu is overwritten without delete;
       callers appear to reach this only when no transpose exists -- confirm no leak */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp = new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT = new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries; `new` throws on failure, so none of these can be NULL */
    delete (THRUSTARRAY*) tempT->values;
    delete (THRUSTINTARRAY32*) tempT->column_indices;
    delete (THRUSTINTARRAY32*) tempT->row_offsets;
    delete (CsrMatrix*) tempT;
    delete (THRUSTARRAY*) temp->values;
    delete (THRUSTINTARRAY32*) temp->column_indices;
    delete (THRUSTINTARRAY32*) temp->row_offsets;
    delete (CsrMatrix*) temp;
  }
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* publish the transpose multiply structure */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
990 
/* Solve A^T x = b using explicitly transposed triangular factors plus the stored
   row/column permutations; the transposed factors are built lazily on first use.
   Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (only done once;
     later calls find the pointers already set) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, writing the result into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (for the transposed system U^T comes before L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2*nnz - n flops: the standard count for a pair of triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1061 
/* Solve A^T x = b with the natural (identity) ordering, so no permutation
   steps are needed; the transposed triangular factors are built lazily. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (only once) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (for the transposed system U^T comes before L^T):
     b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2*nnz - n flops: the standard count for a pair of triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1113 
/* Solve A x = b on the GPU given the triangular factors and the stored
   row/column permutations: x = Pc^T * U^{-1} L^{-1} * Pr * b. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector
     (the copy length is set by the rpermIndices begin/end pair) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation, writing the final result into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2*nnz - n flops: the standard count for a pair of triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1172 
/* Solve A x = b on the GPU with the natural (identity) ordering: x = U^{-1} L^{-1} b,
   with no permutation steps needed. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2*nnz - n flops: the standard count for a pair of triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1216 
/* Mirrors the host AIJ data onto the GPU.
   Fast path: if the matrix was assembled before with the same nonzero state and
   the storage format is CSR, only the numerical values are copied (and the cached
   explicit transpose, if present, is refreshed via csr2csc).  Otherwise the whole
   GPU structure is destroyed and rebuilt in the format selected in
   cusparsestruct->format (CSR, ELL, or HYB).  No-op when the matrix is bound to
   the CPU or the GPU copy is already current. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Same sparsity pattern as the existing GPU copy: copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        /* NOTE(review): rowoffsets_gpu is assumed to have been populated when the
           transpose was first generated (MatSeqAIJCUSPARSEGenerateTransposeForMult)
           -- confirm, since this routine only deletes it in the rebuild path below */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* Nonzero pattern (or storage format) changed: discard and rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only nonempty rows are represented; ridx maps
             each compressed row back to its full row index */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL; /* In this case, one should not dereference ridx */
        }
        cusparsestruct->nrows = m;

        /* allocate the multiply structure (cusparse descriptor plus device-resident scalars) */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* constants 1 and 0 live on the device so spmv/spmm can run with
           CUSPARSE_POINTER_MODE_DEVICE (set just below) */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix: upload row offsets, column indices, and values */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage a temporary CSR copy on the GPU, then convert it to HYB/ELL */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL uses the MAX partition (pure ELLPACK); HYB lets cusparse choose */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once converted */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          /* workVector receives the SpMV result over compressed rows; cprowIndices
             maps it back to full row positions during scatter-add */
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* log host->device traffic: row offsets + column indices (int), row map, values + 3 constants */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1362 
/* Thrust functor: given a tuple (src, dst), accumulates src into dst.
   Used by MatMultAdd to scatter-add the compressed-row work vector into
   the full output vector via a permutation iterator. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1372 
/* Auxiliary data attached to C->product for AIJCUSPARSE * dense products */
typedef struct {
  PetscBool   cisdense; /* caller's C was MATSEQDENSE (CPU); convert back after the numeric phase */
  PetscScalar *Bt;      /* device buffer for an explicit B^T (ABt/RARt only; cusparse spmm cannot transpose B) */
  Mat         X;        /* intermediate dense result for PtAP/RARt */
} MatMatCusparse;
1378 
/* Destructor for the MatMatCusparse product data: releases the intermediate
   matrix, the device transpose buffer, and the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  /* Bt may never have been allocated; cudaFree accepts a NULL pointer */
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1391 
1392 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1393 
/* Numeric phase for products C = op(A) op(B), with A of type SEQAIJCUSPARSE and
   B dense.  AB/AtB compute C directly with cusparse spmm; ABt first forms an
   explicit B^T with cublas geam (spmm cannot transpose B); PtAP/RARt compute an
   intermediate X = A*P (resp. A*R^T) and finish with a dense-dense multiply.
   Fix over previous revision: removed a duplicated `break;` (dead statement)
   in the ABt/RARt case of the dispatch switch. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the GPU matrix (A or its cached transpose) and the spmm dimensions:
     result is m x n, inner dimension k */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP computes X = A*P here; C = P^T*X is done below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use the explicitly stored transpose rather than CUSPARSE_OPERATION_TRANSPOSE */
    if (!cusp->matTranspose) {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    }
    mat = cusp->matTranspose;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->cmap->n;
    k   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt computes X = A*R^T here; C = R*X is done below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* cusparse spmm does not support transpose on B: build B^T explicitly with
     an out-of-place cublas geam into the preallocated mmdata->Bt buffer */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n; /* leading dimension of the transposed copy */
  }
  /* PtAP/RARt write the intermediate result into mmdata->X, the rest directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  /* perform the MatMat operation: carray = 1.0*op(A)*Bmat + 0.0*carray */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);

  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the types the caller handed us */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1513 
/* Symbolic phase for C = op(A) op(B) with A SEQAIJCUSPARSE and B dense: sets the
   sizes and type of C, and allocates the per-product auxiliary data (device
   buffer for B^T when spmm needs it, intermediate dense X for PtAP/RARt).
   Installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* determine the dimensions of the result C for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:   /* C = A*B */
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:  /* C = A^T*B */
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:  /* C = A*B^T */
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP: /* C = P^T*A*P with P = B */
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt: /* C = R*A*R^T with R = B */
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B: preallocate a device buffer
     for an explicit B^T used by the numeric phase */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
1587 
1588 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1589 
1590 /* handles dense B */
/* Selects the product implementation for C when A is SEQAIJCUSPARSE and B is
   dense.  Falls back to the CPU AIJ*Dense path when A is bound to the CPU;
   otherwise installs the CUDA symbolic routine for all supported types. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    /* matrix pinned to host: defer to the CPU implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* every supported product type shares the same symbolic routine; any other
     type is left untouched for other handlers */
  if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB  ||
      product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
      product->type == MATPRODUCT_RARt) {
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  }
  PetscFunctionReturn(0);
}
1615 
/* yy = A*xx, implemented as MatMultAdd with no vector to add (yy argument NULL) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1624 
/* yy = A^T * xx.  Uses an explicitly stored transpose of A on the GPU,
   generated on first use and cached in cusparsestruct->matTranspose, so the
   SpMV itself always runs with CUSPARSE_OPERATION_NON_TRANSPOSE. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build and cache the explicit transpose the first time it is needed */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* beta_zero: yy is overwritten with the SpMV result, not accumulated */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); /* one multiply and one add per stored nonzero */
  PetscFunctionReturn(0);
}
1675 
1676 
/* zz = A*xx + yy (or zz = A*xx when yy is NULL).  When A uses compressed-row
   storage (zero rows dropped), the SpMV is done into a work vector over the
   compressed rows only, then scatter-added into zz through cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    compressed = a->compressedrow.use; /* Does the matrix use compressed rows (i.e., drop zero rows)? */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);

    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv result if compressed */
    /* beta_one only when accumulating in place into zz (yy == zz, uncompressed);
       otherwise the SpMV target is overwritten and yy is added afterwards */
    beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      /* ELL/HYB path; skip the call entirely for an empty matrix */
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) { /* MatMultAdd: zz = A*xx + yy */
      if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
      } else if (zz != yy) { /* A's not compreseed. zz already contains A*xx, and we just need to add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }

    /* ScatterAdd the result from work vector into the full vector when A is compressed */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (compressed) {
      /* zip (workVector[i], zz[cprowIndices[i]]) pairs and accumulate with the
         VecCUDAPlusEquals functor */
      thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1755 
/* zz = A^T*xx + yy.  Uses the cached explicit transpose of A (built on first
   use).  yy is first copied into zz; when yy == zz the SpMV accumulates with
   beta_one, otherwise it overwrites zz and yy is added back afterwards. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*beta;
  PetscErrorCode                  ierr;
  cudaError_t                     cerr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build and cache the transpose the first time it is needed */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  /* Note unlike Mat, MatTranspose uses non-compressed row storage */
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               zarray);CHKERRCUSPARSE(stat);
    } else {
      /* ELL/HYB path; skip the call entirely for an empty matrix */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->nrows) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 zarray);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* when zz != yy the SpMV overwrote zz, so add yy back in */
    if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);}
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1814 
/* Completes assembly: runs the host AIJ assembly, then -- for a final assembly
   of a matrix not bound to the CPU -- pushes the data to the GPU (unless the
   matrix is factored) and installs the GPU multiply kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu) {
    /* factored matrices carry triangular-factor data instead of a plain GPU copy */
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1831 
1832 /* --------------------------------------------------------------------------------*/
1833 /*@
1834    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1836    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1837    assembly performance the user should preallocate the matrix storage by setting
1838    the parameter nz (or the array nnz).  By setting these parameters accurately,
1839    performance during matrix assembly can be increased by more than a factor of 50.
1840 
1841    Collective
1842 
1843    Input Parameters:
1844 +  comm - MPI communicator, set to PETSC_COMM_SELF
1845 .  m - number of rows
1846 .  n - number of columns
1847 .  nz - number of nonzeros per row (same for all rows)
1848 -  nnz - array containing the number of nonzeros in the various rows
1849          (possibly different for each row) or NULL
1850 
1851    Output Parameter:
1852 .  A - the matrix
1853 
1854    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1856    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1857 
1858    Notes:
1859    If nnz is given then nz is ignored
1860 
1861    The AIJ format (also called the Yale sparse matrix format or
1862    compressed row storage), is fully compatible with standard Fortran 77
1863    storage.  That is, the stored row and column indices can begin at
1864    either one (as in Fortran) or zero.  See the users' manual for details.
1865 
1866    Specify the preallocated storage with either nz or nnz (not both).
1867    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1868    allocation.  For large problems you MUST preallocate memory or you
1869    will get TERRIBLE performance, see the users' manual chapter on matrices.
1870 
1871    By default, this format uses inodes (identical nodes) when possible, to
1872    improve numerical efficiency of matrix-vector products and solves. We
1873    search for consecutive rows with the same nonzero structure, thereby
1874    reusing matrix information to achieve increased efficiency.
1875 
1876    Level: intermediate
1877 
1878 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1879 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create an empty matrix object, size it, and give it the CUSPARSE type;
     the order matters: the type must be set before preallocation. */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* Preallocate the host-side AIJ storage; the cast drops const to match the internal signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1891 
/* Destroy an AIJCUSPARSE matrix: free the GPU-side data hanging off spptr,
   clear the composed functions, then fall through to the host SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* spptr holds a Mat_SeqAIJCUSPARSE for unfactored matrices; skip if nothing
       was ever allocated on the device */
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    /* factored matrices store triangular-factor data in spptr instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* Reset composed functions to NULL so no dangling dispatch entries remain */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1911 
1912 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1913 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by first copying the host AIJ matrix, then converting the copy
   in place to AIJCUSPARSE so the GPU data structures are (re)built for it. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1923 
/* Bind (flg=PETSC_TRUE) or unbind (flg=PETSC_FALSE) the matrix to the CPU by
   routing the core operations to the host SeqAIJ or the CUSPARSE implementations. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscFunctionBegin;
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  /* We need to take care of function composition also */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  if (flg) {
    /* bound to CPU: use the host implementations. NOTE(review): destroy is
       intentionally not redirected here, so GPU resources still get freed. */
    A->ops->mult             = MatMult_SeqAIJ;
    A->ops->multadd          = MatMultAdd_SeqAIJ;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate        = MatDuplicate_SeqAIJ;
  } else {
    /* unbound: use the CUSPARSE implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
    A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
    A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1953 
/* Convert a SeqAIJ matrix in place to SeqAIJCUSPARSE: allocate the GPU-side
   container appropriate for the (un)factored state, install the CUSPARSE
   operations, and compose the type-specific functions.  Only
   MAT_INPLACE_MATRIX conversion is supported. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  if (reuse != MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet supported");

  /* vectors associated with this matrix default to CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype == MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;

    spptr->mat            = 0;
    spptr->matTranspose   = 0;
    spptr->workVector     = 0;
    spptr->rowoffsets_gpu = 0;
    spptr->format         = MAT_CUSPARSE_CSR;
    spptr->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    spptr->handle         = handle;
    spptr->nonzerostate   = 0;
    B->spptr = spptr;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    B->spptr = tri;
  }

  /* install the CUSPARSE implementations of the core operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2013 
/* Type constructor: build the host SeqAIJ structure first, then convert the
   matrix in place to attach the CUSPARSE data and operations. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2023 
2024 /*MC
2025    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2026 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2028    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2029    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2030 
2031    Options Database Keys:
2032 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2033 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2034 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2035 
2036   Level: beginner
2037 
2038 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2039 M*/
2040 
2041 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2042 
2043 
/* Register the cuSPARSE solver package as a factorization provider for
   sequential AIJCUSPARSE matrices (LU, Cholesky, ILU, and ICC). */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factortypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  /* all factor types share the same MatGetFactor() implementation */
  for (i=0; i<sizeof(factortypes)/sizeof(factortypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factortypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2055 
2056 
/* Free all GPU-side data of an unfactored AIJCUSPARSE matrix and the
   container itself; the caller's pointer is zeroed so a second call is a
   harmless no-op. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* check the error codes of the destroy helpers (previously ignored) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((handle = (*cusparsestruct)->handle)) { /* double parens: assignment intended */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
2076 
/* Release the CSR arrays (values, column indices, row offsets) and the
   CsrMatrix container; the caller's pointer is zeroed so a repeated call
   is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
2089 
/* Destroy one triangular-factor structure: its cuSPARSE matrix descriptor,
   its solve-analysis info, its CSR storage, and the struct itself.  Zeroes
   the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
2105 
/* Destroy a mult structure: the device matrix (HYB or CSR depending on the
   storage format), the matrix descriptor, the compressed-row index vector,
   and the device-resident scalar constants.  Zeroes the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* error code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta constants live in device memory, so cudaFree them */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2133 
/* Reset a triangular-factors container: destroy all four factor structures
   and the permutation/work vectors, but keep the container (and its cuSPARSE
   handle) alive for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the error codes of the destroy helpers (previously ignored) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector = 0;
  }
  PetscFunctionReturn(0);
}
2151 
/* Fully destroy a triangular-factors container: reset its contents, destroy
   the cuSPARSE handle, and delete the container itself.  Zeroes the caller's
   pointer so a second call is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the error code of the reset helper (previously ignored) */
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) { /* double parens: assignment intended */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
2168 
2169