xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 3703a6a7d9ceda88ee07755360990e1609c9169a)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Option names for the supported GPU storage formats, consumed by PetscOptionsEnum();
   the trailing entries are the enum type name, the option prefix, and the terminator
   required by PETSc's enum-option convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic factorization entry points installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular-solve and matrix-vector operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations: destruction/reset helpers for the GPU-side structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
43 
/*
   MatCUSPARSESetStream - records a CUDA stream on the matrix and attaches it to the
   matrix's cuSPARSE handle so subsequent cuSPARSE calls for this matrix are queued
   on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE; A->spptr must already hold the GPU struct
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
54 
/*
   MatCUSPARSESetHandle - installs a caller-provided cuSPARSE handle on the matrix,
   destroying any different handle the matrix previously held, and switches the
   handle to device pointer mode (scalar arguments of cuSPARSE calls live on the GPU).

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE; A->spptr must already hold the GPU struct
-  handle - the cuSPARSE handle to adopt
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the handle we previously owned before adopting the new one */
    if (cusp->handle) {
      stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
    }
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
70 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle stored on
   the matrix.  Used when the handle is owned by someone else and must not be freed
   when this matrix is destroyed.

   Input Parameter:
.  A - matrix of type SEQAIJCUSPARSE

   Fix: guard against A->spptr being NULL (e.g. when the GPU struct was already
   destroyed or never created); previously this dereferenced the NULL pointer.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing to clear */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
79 
/* Reports the solver package ("cusparse") that produced a factored matrix; composed
   on factor matrices under the "MatFactorGetSolverType_C" method name below. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
86 
87 /*MC
88   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
89   on a single GPU, for matrices of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
90   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
91   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
92   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
93   algorithms are not recommended. This class does NOT support direct solver operations.
94 
95   Level: beginner
96 
97 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
98 M*/
99 
/* Creates the (empty) factor matrix B for A and wires up the CUSPARSE symbolic
   factorization function pointers appropriate for the requested factor type.
   Registered as the MATSOLVERCUSPARSE factory for seqaijcusparse matrices. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  /* factors are square, sized by the row map of A */
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    /* NOTE(review): unlike the LU branch, no MatSetBlockSizesFromMats() call here --
       confirm whether that asymmetry is intentional */
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* real preallocation happens during symbolic factorization; skip it here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
124 
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format on the matrix's GPU struct.  A sequential matrix has a single
   storage format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:            /* fall through: both ops share one format field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
142 
143 /*@
144    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
145    operation. Only the MatMult operation can use different GPU storage formats
146    for MPIAIJCUSPARSE matrices.
147    Not Collective
148 
149    Input Parameters:
150 +  A - Matrix of type SEQAIJCUSPARSE
151 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
152 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
153 
154    Output Parameter:
155 
156    Level: intermediate
157 
158 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
159 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod() is a silent no-op when A is not a CUSPARSE matrix type,
     so this routine is safe to call on any Mat */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
169 
/* Processes the -mat_cusparse_* runtime options controlling the GPU storage format
   used for SpMV and (on factored matrices) triangular solves. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* the SpMV-only format option is offered only for non-factored matrices */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
194 
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* symbolic phase runs entirely on the CPU through the AIJ implementation */
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* route the numeric phase to the CUSPARSE version, which also pushes the factors to the GPU */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* symbolic phase runs entirely on the CPU through the AIJ implementation */
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* route the numeric phase to the CUSPARSE version, which also pushes the factors to the GPU */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* symbolic phase runs entirely on the CPU through the AIJ implementation */
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* route the numeric phase to the CUSPARSE version, which also pushes the factors to the GPU */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* symbolic phase runs entirely on the CPU through the AIJ implementation */
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* route the numeric phase to the CUSPARSE version, which also pushes the factors to the GPU */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/* Builds the unit-diagonal lower-triangular CSR matrix (the L factor of an ILU
   factorization) on the GPU from the host-side factored SeqAIJ data, then runs the
   cuSPARSE solve analysis on it.  No-op when the GPU copy is already current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;        /* host staging: row offsets, column indices */
  PetscScalar                       *AALo;               /* host staging: values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): uses ai[1] rather than ai[0] -- presumably row 0 of the factored
         layout contributes no strictly-lower entries; confirm against the SeqAIJ factored format */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host buffers: the thrust assign() calls below copy them to the device */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;   /* unit diagonal entry of row 0 */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR arrays populated from the pinned staging buffers */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* staging buffers are no longer needed once the device arrays own the data */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/* Builds the upper-triangular CSR matrix (the U factor of an ILU factorization, with
   its non-unit diagonal stored as reciprocals) on the GPU from the host-side factored
   SeqAIJ data, then runs the cuSPARSE solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;        /* host staging: row offsets, column indices */
  PetscScalar                       *AAUp;               /* host staging: values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with the row index in the factored layout, so this difference
         counts all U entries including the diagonal */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned for fast device copies) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first and the
         staging buffers from back to front */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the factored SeqAIJ stores the pivot at v[nz]; store its reciprocal so the
           GPU solve can multiply instead of divide */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR arrays populated from the pinned staging buffers */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* staging buffers are no longer needed once the device arrays own the data */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
440 
/* Rebuilds both ILU triangular factors on the GPU, allocates the solve work vector,
   and uploads the row/column permutation vectors when the orderings are not identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  /* discard any stale GPU factors before rebuilding both triangles */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  factors->workVector = new THRUSTARRAY(n);
  factors->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation (applied before the lower solve) */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r, r+n);
    nperm++;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation (applied after the upper solve) */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c, c+n);
    nperm++;
  }

  /* log one n-sized upload per permutation vector actually transferred */
  if (nperm) {
    ierr = PetscLogCpuToGpu(nperm*n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
488 
/* Builds both triangular factors of an ICC factorization on the GPU.  Only the upper
   triangle is stored on the host; the "lower" factor reuses the same CSR structure and
   is solved with CUSPARSE_OPERATION_TRANSPOSE (hence its UPPER fill mode below). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;        /* host staging: row offsets, column indices */
  PetscScalar                       *AAUp;               /* host staging: values of the U factor */
  PetscScalar                       *AALo;               /* host staging: values of the (transposed) L factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is read through both Mat_SeqAIJ and Mat_SeqSBAIJ views here;
     presumably the ICC factor shares the SBAIJ-style upper-triangular layout -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned for fast device copies) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored pivot; both factors keep its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonal entries are negated for U; L additionally scales by the pivot reciprocal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      /* UPPER fill mode is intentional: the L solve is done as U^T via the TRANSPOSE op below */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* staging buffers are no longer needed once the device arrays own the data */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
636 
/* Rebuilds the ICC factors on the GPU, allocates the solve work vector, and uploads
   the permutation (and its inverse) when the ordering is not identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* discard any stale GPU factors before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* only the upper triangle is stored: count off-diagonal entries twice plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* rows use the permutation directly; columns use its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
673 
/* Numeric LU factorization: factor on the CPU, then select the solve kernels
   and push the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* An identity row AND column ordering allows the faster natural-ordering
     solves that skip the permutation steps */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
703 
/* Numeric Cholesky factorization: factor on the CPU, then select the solve
   kernels and push the triangular factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* An identity ordering allows the faster natural-ordering solves that skip
     the permutation steps */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
732 
/* Build the transposes of the lower and upper triangular factors on the GPU
   (via csr2csc) and run the cusparse solve analysis on them, so that
   MatSolveTranspose can use NON_TRANSPOSE solves on explicitly transposed
   factors.  Called lazily the first time a transpose solve is requested;
   callers guard on loTriFactorPtrTranspose/upTriFactorPtrTranspose being NULL,
   so no pre-existing transpose structures are freed here. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* allocated below; the old code initialized these from */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;  /* cusparseTriFactors only to overwrite them (dead stores) */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     transposing flips the fill mode (lower <-> upper) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve op is NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the new transpose structure */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     transposing flips the fill mode (lower <-> upper) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve op is NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the new transpose structure */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
863 
/* Build the explicit transpose of the matrix on the GPU for MatMultTranspose.
   For CSR storage the transpose is obtained with a single csr2csc; for ELL/HYB
   the matrix is converted HYB -> CSR -> CSC -> HYB.  Called lazily; callers
   guard on cusparsestruct->matTranspose being NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* allocated below; the old code initialized this from
                                                cusparsestruct->matTranspose only to overwrite it (dead store) */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the matrix-multiply information for the transpose */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars for the cusparse SpMV calls */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose dimensions: rows/cols swapped, same nonzero count */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* full (uncompressed) row offsets of A, uploaded from the host */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT keeps A's dimensions (rmap x cmap) and its
       row_offsets are sized rmap->n+1, but csr2csc writes cmap->n+1 offsets;
       this looks wrong for rectangular matrices -- confirm, likely only
       exercised for square A */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* publish the new transpose structure */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
990 
/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b where A = LU: permute b, solve U^T then L^T on explicitly
   transposed factors (built lazily), then apply the column permutation. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]].
     Both range endpoints share the base pointer bGPU; the range length is
     carried by the index iterators.  (The previous code used bGPU+n for the
     end iterator, which compares equal but read misleadingly.) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U^T */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1061 
/* Solve A^T x = b for a naturally ordered factorization A = LU:
   no permutations are needed, so solve U^T then L^T directly on the
   explicitly transposed factors (built lazily on first use). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* build the transposed factors the first time a transpose solve is requested */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* solve U^T work = b ... */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get());CHKERRCUSPARSE(stat);

  /* ... then L^T x = work */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x);CHKERRCUSPARSE(stat);

  /* restore the vectors and account for the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1113 
/* Solve A x = b where A = LU with row/column permutations:
   work = P_r b, solve L, solve U, then x = P_c-scatter of the result. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]].
     The range length is carried by the rpermIndices iterators */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: x <- L^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: tempGPU <- U^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1172 
/* Solve A x = b for a naturally ordered factorization A = LU:
   no permutations are needed, so solve L then U directly. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* solve L work = b ... */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get());CHKERRCUSPARSE(stat);

  /* ... then U x = work */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUSPARSE(stat);

  /* restore the vectors and account for the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1216 
/*
   Mirrors the host (Mat_SeqAIJ) matrix onto the GPU when the device copy is
   absent or stale (offloadmask UNALLOCATED or CPU).  If the nonzero pattern is
   unchanged and the format is CSR, only the numerical values are refreshed
   (and the cached transpose, if any, is renumbered); otherwise the device
   structures are destroyed and rebuilt in the requested storage format.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0); /* matrix is pinned to the CPU: never copy */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        /* csr2csc with CUSPARSE_ACTION_NUMERIC recomputes the transposed values on the device */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* Pattern (or format) changed: tear down and rebuild all device-side structures */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage drops empty rows; ridx maps compressed row -> full row */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL; /* In this case, one should not dereference ridx */
        }
        cusparsestruct->nrows = m;

        /* allocate and set up the structure holding the device matrix for SpMV */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; required because the handle is put in POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* build a temporary CSR matrix on the device, convert it to HYB/ELL, then discard the CSR copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL uses the MAX partition (pure ELLPACK); HYB lets cusparse choose the split */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once the HYB matrix exists */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          /* workVector receives SpMV results for the nonempty rows; cprowIndices scatters them back */
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      /* remember the pattern the device copy corresponds to, for the values-only fast path above */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1362 
/* Thrust functor: given a tuple (src,dst), accumulates src into dst (dst += src).
   Used to scatter-add compressed-row SpMV results back into the full vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1372 
/* Scratch data attached to a MatProduct whose A is SeqAIJCUSPARSE and B is dense */
typedef struct {
  PetscBool   cisdense; /* C was MATSEQDENSE (CPU) on entry to the symbolic phase; convert the result back after computing on the GPU */
  PetscScalar *Bt;      /* device buffer holding B^T for ABt/RARt (cusparse spmm does not support transposing B) */
  Mat         X;        /* dense intermediate of the sparse-times-dense stage for PtAP/RARt */
} MatMatCusparse;
1378 
/* Destructor for the MatMatCusparse product data attached by the symbolic phase */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* release the device transpose buffer (cudaFree is a no-op on NULL),
     the dense intermediate, and finally the struct itself */
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1391 
1392 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1393 
/*
   Numeric phase for C = op(A)*op(B) with A SeqAIJCUSPARSE and B dense, covering
   AB, AtB, ABt, PtAP and RARt.  Requires the product data (MatMatCusparse) set
   up by MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the device matrix (A or its explicit transpose) and the spmm sizes:
     the sparse stage always computes an (m x k) times (k x n) product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP: the sparse stage computes X = A*B; B^T*X is done below on the dense side */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use the explicitly stored transpose rather than a transposed spmm */
    if (!cusp->matTranspose) {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    }
    mat = cusp->matTranspose;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->cmap->n;
    k   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt: the sparse stage computes X = A*B^T; B*X is done below on the dense side */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-stage result into the intermediate X, everything else directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    /* materialize B^T into mmdata->Bt with an out-of-place cublas geam transpose */
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n; /* leading dimension of the transposed copy */
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForGPU();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* second (dense) stage for the two-stage products: RARt does C = B*X, PtAP does C = B^T*X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary conversions so C and B come back in the types the caller supplied */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1518 
/*
   Symbolic phase for C = op(A)*op(B) with A SeqAIJCUSPARSE and B dense:
   sizes C, allocates the product scratch data (MatMatCusparse), and installs
   the numeric routine.  Only the CSR storage format is supported.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:   /* C = A*B */
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:  /* C = A^T*B */
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:  /* C = A*B^T */
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP: /* C = B^T*A*B */
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt: /* C = B*A*B^T */
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  /* NOTE: cisdense must be queried before MatSetType changes C's type */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    /* device buffer that the numeric phase fills with an explicit B^T */
    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
1592 
1593 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1594 
1595 /* handles dense B */
/*
   Installs the symbolic routine for sparse(CUSPARSE)-times-dense products on C,
   or falls back to the CPU SeqAIJ x SeqDense path when A is bound to the CPU.
   Unsupported product types are left untouched (no error here; the generic
   MatProduct machinery reports them later).
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    /* A lives on the CPU: defer to the plain SeqAIJ x SeqDense implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    break; /* explicit break: the previous implicit fallthrough into default was harmless but fragile */
  default:
    break;
  }
  PetscFunctionReturn(0);
}
1620 
/* yy = A*xx: implemented as MatMultAdd with a NULL additive vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1629 
/*
   yy = A^T * xx on the GPU.  Implemented as a NON_TRANSPOSE SpMV with an
   explicitly stored transpose of A, built lazily on first use.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the device transpose on first use and cache it in cusparsestruct */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* SpMV with the stored transpose; beta_zero so yy is overwritten */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1680 
1681 
/*
   zz = A*xx + yy (MatMultAdd), or zz = A*xx when yy is NULL (MatMult).
   When A uses compressed-row storage (empty rows dropped) the SpMV result is
   produced in a work vector sized to the nonempty rows and then scatter-added
   into zz via the cached compressed-row index map.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    compressed = a->compressedrow.use; /* Does the matrix use compressed rows (i.e., drop zero rows)? */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);

    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv result if compressed */
    /* beta_one only for in-place zz = A*xx + zz with uncompressed storage; otherwise the SpMV overwrites its target */
    beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      /* ELL/HYB path; skip the SpMV entirely when there are no rows */
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) { /* MatMultAdd: zz = A*xx + yy */
      if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
      } else if (zz != yy) { /* A's not compreseed. zz already contains A*xx, and we just need to add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }

    /* ScatterAdd the result from work vector into the full vector when A is compressed */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (compressed) {
      /* for each compressed row r: zz[cprowIndices[r]] += workVector[r] (VecCUDAPlusEquals over a zip of work values and permuted zz entries) */
      thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1760 
/*
   zz = A^T*xx + yy, using the cached explicit transpose of A (built lazily).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*beta;
  PetscErrorCode                  ierr;
  cudaError_t                     cerr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  /* Note unlike Mat, MatTranspose uses non-compressed row storage */
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* if yy==zz the SpMV accumulates (beta_one: zz += A^T*xx); otherwise it
       overwrites zz (beta_zero) and yy is added back by the AXPY below */
    beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               zarray);CHKERRCUSPARSE(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->nrows) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 zarray);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);}
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1819 
/*
   Completes assembly on the host, then (for a final assembly of a matrix not
   bound to the CPU) pushes the data to the GPU and installs the CUSPARSE
   multiply kernels.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* flush assemblies and CPU-bound matrices need no device work */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu) {
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1836 
1837 /* --------------------------------------------------------------------------------*/
1838 /*@
1839    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
1842    assembly performance the user should preallocate the matrix storage by setting
1843    the parameter nz (or the array nnz).  By setting these parameters accurately,
1844    performance during matrix assembly can be increased by more than a factor of 50.
1845 
1846    Collective
1847 
1848    Input Parameters:
1849 +  comm - MPI communicator, set to PETSC_COMM_SELF
1850 .  m - number of rows
1851 .  n - number of columns
1852 .  nz - number of nonzeros per row (same for all rows)
1853 -  nnz - array containing the number of nonzeros in the various rows
1854          (possibly different for each row) or NULL
1855 
1856    Output Parameter:
1857 .  A - the matrix
1858 
1859    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1861    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1862 
1863    Notes:
1864    If nnz is given then nz is ignored
1865 
1866    The AIJ format (also called the Yale sparse matrix format or
1867    compressed row storage), is fully compatible with standard Fortran 77
1868    storage.  That is, the stored row and column indices can begin at
1869    either one (as in Fortran) or zero.  See the users' manual for details.
1870 
1871    Specify the preallocated storage with either nz or nnz (not both).
1872    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1873    allocation.  For large problems you MUST preallocate memory or you
1874    will get TERRIBLE performance, see the users' manual chapter on matrices.
1875 
1876    By default, this format uses inodes (identical nodes) when possible, to
1877    improve numerical efficiency of matrix-vector products and solves. We
1878    search for consecutive rows with the same nonzero structure, thereby
1879    reusing matrix information to achieve increased efficiency.
1880 
1881    Level: intermediate
1882 
1883 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1884 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix, so the local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate the host (CPU) storage; the cast drops const to match the _SeqAIJ interface */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1896 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Release the GPU-side data hanging off spptr: factored matrices carry a
     triangular-factors structure, unfactored ones the Mat_SeqAIJCUSPARSE struct
     (which only exists once something has been offloaded) */
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  /* Remove the methods composed onto the object at conversion time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  /* finish with the base SeqAIJ destruction (host data, object teardown) */
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1916 
1917 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1918 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Duplicate as a plain SeqAIJ matrix first, then convert the copy in place to the
     CUSPARSE type so it picks up the GPU data structures and the CUSPARSE ops table */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1928 
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscFunctionBegin;
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  /* We need to take care of function composition also */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  if (flg) {
    /* bind to CPU: route the core operations through the host (SeqAIJ) kernels */
    A->ops->mult             = MatMult_SeqAIJ;
    A->ops->multadd          = MatMultAdd_SeqAIJ;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate        = MatDuplicate_SeqAIJ;
  } else {
    /* unbind: route the core operations through the CUSPARSE (GPU) kernels;
       note destroy stays MatDestroy_SeqAIJCUSPARSE in both cases since spptr
       must be freed regardless of where the operations run */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
    A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
    A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1958 
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;
  Mat              B;

  PetscFunctionBegin;
  /* Obtain the matrix that will be converted in place to the CUSPARSE type */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created to match this matrix should default to the CUDA type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* you cannot check the inode.use flag here since the matrix was just created.
         now build a GPU matrix data structure */
      Mat_SeqAIJCUSPARSE *cusp;

      B->spptr = new Mat_SeqAIJCUSPARSE;
      cusp     = (Mat_SeqAIJCUSPARSE*)B->spptr;

      cusp->mat            = 0;
      cusp->matTranspose   = 0;
      cusp->workVector     = 0;
      cusp->rowoffsets_gpu = 0;
      cusp->format         = MAT_CUSPARSE_CSR;
      cusp->stream         = 0;
      stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
      cusp->handle         = handle;
      cusp->nonzerostate   = 0;
    } else {
      /* NEXT, set the pointers to the triangular factors */
      Mat_SeqAIJCUSPARSETriFactors *fs;

      B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
      fs       = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

      fs->loTriFactorPtr          = 0;
      fs->upTriFactorPtr          = 0;
      fs->loTriFactorPtrTranspose = 0;
      fs->upTriFactorPtrTranspose = 0;
      fs->rpermIndices            = 0;
      fs->cpermIndices            = 0;
      fs->workVector              = 0;
      stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
      fs->handle                  = handle;
      fs->nnz                     = 0;
    }
  }

  /* install the CUSPARSE implementations of the core matrix operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2026 
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create a host SeqAIJ matrix, then convert it in place; the conversion installs
     the GPU data structures, the CUSPARSE ops table, and the composed methods */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2036 
2037 /*MC
2038    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2039 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2041    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2042    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2043 
2044    Options Database Keys:
2045 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2046 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2047 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2048 
2049   Level: beginner
2050 
2051 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2052 M*/
2053 
2054 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2055 
2056 
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  /* all four factorization kinds are served by the same CUSPARSE factory routine;
     keep the registration order LU, Cholesky, ILU, ICC */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2068 
2069 
/* Free the Mat_SeqAIJCUSPARSE struct and everything it owns (GPU mat, its transpose,
   work vector, row offsets, cusparse handle); sets *cusparsestruct to 0 */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* check the error codes of the multstruct destroys instead of silently dropping them */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((handle = (*cusparsestruct)->handle)) {   /* intentional assignment; handle may be 0 */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
2089 
/* Free a CsrMatrix and its three device arrays; sets *mat to 0 */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
2102 
/* Free a triangular-factor structure: cusparse descriptor, solve-analysis info,
   and the CSR storage; sets *trifactor to 0 */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
2118 
/* Free a mult structure: the GPU matrix in whichever storage format it uses,
   the cusparse descriptor, the compressed-row indices, and the device-resident
   alpha/beta scalars; sets *matstruct to 0 */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB are both stored as a cusparse hybrid matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code instead of silently dropping it */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2146 
/* Free the contents of a triangular-factors structure (the four factors, the
   permutation index arrays, and the work vector) without freeing the structure
   itself or its cusparse handle, so it can be refilled by a new factorization */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the error codes of the factor destroys instead of silently dropping them */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector = 0;
  }
  PetscFunctionReturn(0);
}
2164 
/* Fully destroy a triangular-factors structure: reset its contents, destroy the
   cusparse handle, and free the structure itself; sets *trifactors to 0 */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the reset's error code instead of silently dropping it */
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) {   /* intentional assignment; handle may be 0 */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
2181 
2182