xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 5a856986583887c326abe5dfd149e8184a29cd80)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* Names for the MatCUSPARSEStorageFormat enum values, used with PetscOptionsEnum();
   the trailing entries are the enum type name, the option prefix, and a terminating 0
   as required by the PetscOptionsEnum() convention */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
/* Forward declarations of the SeqAIJCUSPARSE implementation routines defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* NOTE(review): the next declaration takes a Mat_SeqAIJCUSPARSETriFactorStruct but shares the
   name MatSeqAIJCUSPARSEMultStruct_Destroy with the following one; this is a legal C++ overload
   in a .cu file, but the name suggests it may have been intended to be a TriFactorStruct
   destructor -- confirm against the definitions later in the file */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/*
   MatCUSPARSESetStream - records a CUDA stream in the matrix's GPU data and
   points the matrix's cusparse handle at it, so subsequent cusparse calls for
   this matrix are issued on that stream.

   A->spptr must already hold a valid Mat_SeqAIJCUSPARSE (no null check is performed).
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  cstat = cusparseSetStream(cusparsestruct->handle,stream);CHKERRCUDA(cstat);
  PetscFunctionReturn(0);
}
51 
/*
   MatCUSPARSESetHandle - installs an externally created cusparse handle on the
   matrix, destroying any handle the matrix previously held, and switches the
   handle to device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (handle != cusparsestruct->handle) {
    /* release the handle we currently own before adopting the new one */
    if (cusparsestruct->handle) {cstat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(cstat);}
    cusparsestruct->handle = handle;
  }
  /* scalar arguments (alpha/beta) will be read from device memory */
  cstat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(cstat);
  PetscFunctionReturn(0);
}
67 
/*
   MatCUSPARSEClearHandle - forgets the matrix's cusparse handle without
   destroying it (presumably the handle is owned elsewhere -- confirm with the
   callers that set it via MatCUSPARSESetHandle()).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    cusparsestruct->handle = 0;
  }
  PetscFunctionReturn(0);
}
76 
/*
   MatFactorGetSolverType_seqaij_cusparse - reports MATSOLVERCUSPARSE as the solver
   package for factor matrices created by this file; composed on factor matrices
   under the name "MatFactorGetSolverType_C".
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an empty SEQAIJCUSPARSE factor
   matrix for A and installs the symbolic-factorization entry points that match
   the requested factor type (LU/ILU/ILUDT or Cholesky/ICC).
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       m = A->rmap->n;

  PetscFunctionBegin;
  /* the factor is a square sequential matrix sized by the local rows of A */
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,m,m,m,m);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* data arrays are allocated by the symbolic factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(); stores the requested GPU storage format. For
   sequential matrices MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same
   (single) format field.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:            /* fall through: both operations share one format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* pre-CUDA-4.2 builds support only CSR */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
143 
144 /*@
145    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146    operation. Only the MatMult operation can use different GPU storage formats
147    for MPIAIJCUSPARSE matrices.
148    Not Collective
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
154 
155    Output Parameter:
156 
157    Level: intermediate
158 
159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; silently a no-op otherwise */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
170 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* options that
   select the GPU storage format used for SpMV and for the triangular solves.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* SpMV-only format; offered only for non-factor matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
  }
  /* format for all operations (SpMV and triangular solves) */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
196 
/* Symbolic ILU: delegate to the CPU AIJ implementation, then route the numeric stage
   through the CUSPARSE version so the factors end up on the GPU */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
206 
/* Symbolic LU: delegate to the CPU AIJ implementation, then route the numeric stage
   through the CUSPARSE version so the factors end up on the GPU */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
216 
/* Symbolic ICC: delegate to the CPU AIJ implementation, then route the numeric stage
   through the CUSPARSE Cholesky version so the factors end up on the GPU */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
226 
/* Symbolic Cholesky: delegate to the CPU AIJ implementation, then route the numeric
   stage through the CUSPARSE version so the factors end up on the GPU */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
236 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - assembles the unit lower triangular
   factor L of an ILU-factored AIJ matrix in pinned host memory, copies it to the
   GPU as a 32-bit-index CSR matrix, and runs the cusparse triangular solve
   analysis on it.

   Assumes a->i/a->j/a->a enumerate the strictly lower triangular entries of each
   row of the factor (PETSc factored-AIJ layout -- TODO confirm); an explicit unit
   diagonal entry is appended to each row.  Only rebuilds when the GPU copy is
   missing or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 has only its diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the thrust assign()s below copy at full bandwidth */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;   /* row 0 is just the unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device CSR copy of the host arrays built above */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* NOTE(review): the host buffers are not freed if an error return fires above -- leak on the error path */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
338 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - assembles the upper triangular
   factor U of an ILU-factored AIJ matrix in pinned host memory, copies it to
   the GPU as CSR, and runs the cusparse solve analysis on it.

   Assumes a->diag addresses the U part of the factored AIJ storage with rows
   reachable via adiag[i+1]+1 .. adiag[i] (PETSc factored-AIJ layout -- TODO
   confirm); the stored diagonal value is inverted here so the solve can
   multiply instead of divide.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory so the device copies below run at full bandwidth */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows bottom-up since the U
         part is stored in reverse row order */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];   /* stored factor keeps 1/diag at position nz; invert for the solve */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device CSR copy of the host arrays built above */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* NOTE(review): not freed if an error return fires above -- leak on the error path */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
436 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both triangular factors of an
   ILU factorization on the GPU, allocates the solve work vector, and uploads the
   row/column permutations when they are not the identity (identity permutations
   let the solve skip the gather/scatter entirely).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* both builds check valid_GPU_matrix themselves, so they must run before the flag is updated below */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
474 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds both triangular factors of an
   ICC/Cholesky factorization on the GPU.  A single upper-triangular CSR pattern
   is assembled on the host; the upper factor uses it directly (unit diagonal,
   non-transposed solve) while the "lower" factor reuses the same pattern with
   different values and a TRANSPOSE solve op, so no separate lower CSR is built.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is a Mat_SeqAIJ (cast above) but is re-cast to Mat_SeqSBAIJ here;
     this relies on the i/j/a members living at compatible offsets in both structs -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned for fast device copies) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];   /* v[nz] holds the row's diagonal entry */
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals are negated for U; L's values are additionally scaled by 1/diag */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the lower factor is stored with the SAME
         upper-triangular pattern and solved with a TRANSPOSE op (see solveOp below),
         which is mathematically the lower-triangular solve */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
619 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC triangular factors on
   the GPU, allocates the solve work vector, and uploads the (symmetric) row
   permutation for both the row and column sides when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both factors store the off-diagonals, the diagonal is shared: 2*(nz-n) + n */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* Cholesky uses one symmetric permutation for both sides */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
647 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - performs the numeric LU factorization on
   the CPU, installs the solve kernels appropriate for the ordering (the
   permutation-free variants when both orderings are the identity), and copies
   the triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering (identity row and column permutations) admits the faster solve path */
  ierr = ISIdentity(b->row,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(b->col,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* build/copy the triangular factors on the GPU and run the cusparse analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
672 
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *factors = (Mat_SeqAIJ*)B->data;
  PetscBool      natural_ordering;

  PetscFunctionBegin;
  /* Run the host (CPU) numeric Cholesky/ICC factorization first */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* Pick the solve kernels: the natural-ordering variants avoid the
     permutation steps and apply only when the row ordering is the identity */
  ierr = ISIdentity(factors->row,&natural_ordering);CHKERRQ(ierr);
  if (natural_ordering) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* Push the freshly computed triangular factors to the GPU and analyze them */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
697 
/*
   Builds the transposed (CSC) copies of the lower and upper triangular
   factors on the GPU and runs the cuSPARSE solve analysis on them, so that
   MatSolveTranspose can run as two NON_TRANSPOSE triangular solves.

   Callers invoke this lazily, only when loTriFactorPtrTranspose and
   upTriFactorPtrTranspose are still NULL; calling it again without first
   destroying the existing transposes would leak them.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* allocated below; not read from A->spptr */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;  /* allocated below; not read from A->spptr */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips (lower becomes upper) since the matrix is transposed */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so the solve itself
     is a NON_TRANSPOSE solve on the transposed data */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC
     (the factors are square, so the CSC arrays have the same sizes as the CSR) */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the freshly allocated transpose on the matrix; this is required,
     since loTriFactorT is a local that was just allocated above */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     the fill mode flips (upper becomes lower) since the matrix is transposed */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation (NON_TRANSPOSE on explicitly transposed data, as above) */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the freshly allocated transpose on the matrix (required, as above) */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
828 
/*
   Builds an explicit GPU copy of A^T (in the storage format selected for A)
   so that MatMultTranspose can run as a NON_TRANSPOSE SpMV.

   Callers invoke this lazily, only when matTranspose is still NULL; calling
   it again without destroying the existing transpose would leak it.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* allocated below; not read from cusparsestruct */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars used by the SpMV calls */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose is cmap->n by rmap->n, so its row pointer array has
       A->cmap->n+1 entries (csr2csc writes num_cols+1 column pointers) */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR (temp has the shape of A itself) */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose);
       tempT has the shape of A^T: cmap->n rows by rmap->n columns */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB; the dimensions passed are those of tempT,
       i.e. of the transpose */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices: the transpose has A->cmap->n rows and
     is treated as having no empty rows (identity row index map) */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());

  /* publish the freshly allocated transpose on the matrix; this is required,
     since matstructT is a local that was just allocated above */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
959 
/*
   Solves A^T x = b on the GPU for a permuted (non-natural-ordering) factored
   matrix: x = P_c^T U^-T L^-T P_r b, realized here as a row permutation,
   then the U^T solve, then the L^T solve, then the column permutation.
   The explicitly transposed factors are built lazily on first use.

   NOTE(review): this path is installed only for non-identity orderings
   (see MatLUFactorNumeric_SeqAIJCUSPARSE), so rpermIndices/cpermIndices are
   presumed non-NULL here — confirm against the symbolic-factorization setup.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (lazily, on first transpose solve) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: for the transposed system the factor order reverses,
     since (LU)^T = U^T L^T.  Result goes into the work vector. */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: work vector back into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and block until the GPU work completes */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  /* two triangular solves: ~2*nnz(factors) flops minus one pass over the diagonal */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1027 
/*
   Solves A^T x = b on the GPU when the factorization used the natural
   (identity) ordering: no permutation steps are needed, just the
   U^T solve followed by the L^T solve (since (LU)^T = U^T L^T).
   The explicitly transposed factors are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (lazily, on first transpose solve) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve U: b into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: work vector into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the vector arrays and block until the GPU work completes */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves: ~2*nnz(factors) flops minus one pass over the diagonal */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1076 
/*
   Solves A x = b on the GPU for a permuted (non-natural-ordering) factored
   matrix: x = P_c^T U^-1 L^-1 P_r b, realized here as a row permutation,
   then the L solve, then the U solve, then the column permutation.

   NOTE(review): this path is installed only for non-identity orderings
   (see MatLUFactorNumeric_SeqAIJCUSPARSE), so rpermIndices/cpermIndices are
   presumed non-NULL here — confirm against the symbolic-factorization setup.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x[i] = b[rperm[i]].
     The copied range is delimited by the index iterators (begin/end of
     rpermIndices); the element iterator bGPU is only the lookup base. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: permuted right-hand side into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: work vector back into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and block until the GPU work completes */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves: ~2*nnz(factors) flops minus one pass over the diagonal */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1135 
/*
   Solves A x = b on the GPU when the factorization used the natural
   (identity) ordering: no permutation steps are needed, just the
   L solve followed by the U solve.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve L: b into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U: work vector into x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the vector arrays and block until the GPU work completes */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves: ~2*nnz(factors) flops minus one pass over the diagonal */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1176 
1177 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1178 {
1179 
1180   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1181   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1182   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1183   PetscInt                     m = A->rmap->n,*ii,*ridx;
1184   PetscErrorCode               ierr;
1185   cusparseStatus_t             stat;
1186   cudaError_t                  err;
1187 
1188   PetscFunctionBegin;
1189   if (A->pinnedtocpu) PetscFunctionReturn(0);
1190   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1191     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1192     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1193       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1194       /* copy values only */
1195       matrix->values->assign(a->a, a->a+a->nz);
1196     } else {
1197       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1198       try {
1199         cusparsestruct->nonzerorow=0;
1200         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1201 
1202         if (a->compressedrow.use) {
1203           m    = a->compressedrow.nrows;
1204           ii   = a->compressedrow.i;
1205           ridx = a->compressedrow.rindex;
1206         } else {
1207           /* Forcing compressed row on the GPU */
1208           int k=0;
1209           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1210           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1211           ii[0]=0;
1212           for (int j = 0; j<m; j++) {
1213             if ((a->i[j+1]-a->i[j])>0) {
1214               ii[k]  = a->i[j];
1215               ridx[k]= j;
1216               k++;
1217             }
1218           }
1219           ii[cusparsestruct->nonzerorow] = a->nz;
1220           m = cusparsestruct->nonzerorow;
1221         }
1222 
1223         /* allocate space for the triangular factor information */
1224         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1225         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1226         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1227         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1228 
1229         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1230         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1231         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1232         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1233         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1234         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1235         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1236 
1237         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1238         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1239           /* set the matrix */
1240           CsrMatrix *matrix= new CsrMatrix;
1241           matrix->num_rows = m;
1242           matrix->num_cols = A->cmap->n;
1243           matrix->num_entries = a->nz;
1244           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1245           matrix->row_offsets->assign(ii, ii + m+1);
1246 
1247           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1248           matrix->column_indices->assign(a->j, a->j+a->nz);
1249 
1250           matrix->values = new THRUSTARRAY(a->nz);
1251           matrix->values->assign(a->a, a->a+a->nz);
1252 
1253           /* assign the pointer */
1254           matstruct->mat = matrix;
1255 
1256         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1257 #if CUDA_VERSION>=4020
1258           CsrMatrix *matrix= new CsrMatrix;
1259           matrix->num_rows = m;
1260           matrix->num_cols = A->cmap->n;
1261           matrix->num_entries = a->nz;
1262           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1263           matrix->row_offsets->assign(ii, ii + m+1);
1264 
1265           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1266           matrix->column_indices->assign(a->j, a->j+a->nz);
1267 
1268           matrix->values = new THRUSTARRAY(a->nz);
1269           matrix->values->assign(a->a, a->a+a->nz);
1270 
1271           cusparseHybMat_t hybMat;
1272           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1273           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1274             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1275           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1276               matstruct->descr, matrix->values->data().get(),
1277               matrix->row_offsets->data().get(),
1278               matrix->column_indices->data().get(),
1279               hybMat, 0, partition);CHKERRCUDA(stat);
1280           /* assign the pointer */
1281           matstruct->mat = hybMat;
1282 
1283           if (matrix) {
1284             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1285             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1286             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1287             delete (CsrMatrix*)matrix;
1288           }
1289 #endif
1290         }
1291 
1292         /* assign the compressed row indices */
1293         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1294         matstruct->cprowIndices->assign(ridx,ridx+m);
1295 
1296         /* assign the pointer */
1297         cusparsestruct->mat = matstruct;
1298 
1299         if (!a->compressedrow.use) {
1300           ierr = PetscFree(ii);CHKERRQ(ierr);
1301           ierr = PetscFree(ridx);CHKERRQ(ierr);
1302         }
1303         cusparsestruct->workVector = new THRUSTARRAY(m);
1304       } catch(char *ex) {
1305         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1306       }
1307       cusparsestruct->nonzerostate = A->nonzerostate;
1308     }
1309     ierr = WaitForGPU();CHKERRCUDA(ierr);
1310     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1311     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1312   }
1313   PetscFunctionReturn(0);
1314 }
1315 
/* Thrust functor: given a (source, destination) tuple, accumulates the first
   component into the second.  Used to scatter-add compressed-row SpMV results
   from the work vector back into the full result vector. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
1325 
/*
   MatMult_SeqAIJCUSPARSE - computes yy = A*xx on the GPU.

   Thin wrapper: MatMultAdd_SeqAIJCUSPARSE with a NULL "add" vector performs
   a plain multiply (it overwrites/zeroes yy as needed).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1334 
/*
   MatMultTranspose_SeqAIJCUSPARSE - computes yy = A^T * xx on the GPU.

   An explicit transpose of the matrix (matTranspose) is built lazily on first
   use; the SpMV is then issued in NON_TRANSPOSE mode on that transposed copy.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the explicit transpose on the GPU */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* yarray = alpha*A^T*xarray + beta_zero*yarray; the scalar pointers live in
       device memory (POINTER_MODE_DEVICE is set when the struct is built) */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
#endif
    /* NOTE(review): with CUDA < 4.2 this ELL/HYB branch compiles to nothing and
       yy is left at the zero set above -- confirm such builds cannot select
       these formats */
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* one multiply-add per nonzero, minus the initial add of each nonempty row */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1382 
1383 
/*
   MatMultAdd_SeqAIJCUSPARSE - computes zz = A*xx + yy on the GPU;
   when yy is NULL the result is zz = A*xx (MatMult_SeqAIJCUSPARSE calls it
   that way).

   If the GPU copy stores only nonempty rows (compressed row storage), the
   SpMV writes into a work vector of the compressed length and the result is
   scattered back into zz through cprowIndices with a += operation.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* when the work vector covers every row there are no compressed rows and the
       SpMV can target zz directly; otherwise it targets the temporary */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place (beta = 1) only when writing straight into zz and zz is
       also the vector being added; every other case overwrites (beta = 0) */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
#endif
    }

    if (yy) {
      if (dptr != zarray) {
        /* compressed path: seed zz with yy; the product is added by the scatter below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* direct path, distinct add vector: zz holds A*xx, add yy explicitly */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
      /* (yy == zz with the direct path was already handled via beta_one above) */
    } else if (dptr != zarray) {
      /* plain multiply through the work vector: clear zz before the scatter-add */
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1459 
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - computes zz = A^T*xx + yy on the GPU.

   Uses the lazily-built explicit transpose (matTranspose); the product is
   computed into the work vector and then scatter-added into zz, which starts
   as a copy of yy.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the explicit transpose on the GPU */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* zz starts as a copy of yy; A^T*xx is added into it by the scatter below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta_zero,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* NOTE(review): the scatter length is A->cmap->n while the source is
       workVector and the index map is cprowIndices; if the transpose structure
       uses compressed rows these sizes could differ -- verify they always agree
       here, unlike MatMultAdd which uses workVector->size() */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1525 
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - completes assembly: runs the CPU AIJ
   assembly, mirrors the data onto the GPU for non-factored matrices, and on
   final assembly installs the CUSPARSE multiply kernels in the operations
   table. A flush assembly leaves the operations table untouched.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1542 
1543 /* --------------------------------------------------------------------------------*/
1544 /*@
1545    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1546    (the default parallel PETSc format). This matrix will ultimately be pushed down
1547    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1548    assembly performance the user should preallocate the matrix storage by setting
1549    the parameter nz (or the array nnz).  By setting these parameters accurately,
1550    performance during matrix assembly can be increased by more than a factor of 50.
1551 
1552    Collective
1553 
1554    Input Parameters:
1555 +  comm - MPI communicator, set to PETSC_COMM_SELF
1556 .  m - number of rows
1557 .  n - number of columns
1558 .  nz - number of nonzeros per row (same for all rows)
1559 -  nnz - array containing the number of nonzeros in the various rows
1560          (possibly different for each row) or NULL
1561 
1562    Output Parameter:
1563 .  A - the matrix
1564 
1565    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1566    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1567    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1568 
1569    Notes:
1570    If nnz is given then nz is ignored
1571 
1572    The AIJ format (also called the Yale sparse matrix format or
1573    compressed row storage), is fully compatible with standard Fortran 77
1574    storage.  That is, the stored row and column indices can begin at
1575    either one (as in Fortran) or zero.  See the users' manual for details.
1576 
1577    Specify the preallocated storage with either nz or nnz (not both).
1578    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1579    allocation.  For large problems you MUST preallocate memory or you
1580    will get TERRIBLE performance, see the users' manual chapter on matrices.
1581 
1582    By default, this format uses inodes (identical nodes) when possible, to
1583    improve numerical efficiency of matrix-vector products and solves. We
1584    search for consecutive rows with the same nonzero structure, thereby
1585    reusing matrix information to achieve increased efficiency.
1586 
1587    Level: intermediate
1588 
1589 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1590 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the cast drops const to match the internal preallocation routine's signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1602 
/*
   MatDestroy_SeqAIJCUSPARSE - releases the GPU-side data hanging off A->spptr
   (multiply structures for a regular matrix, triangular factors for a factored
   one) and then destroys the underlying SeqAIJ matrix.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* nothing to free if the GPU copy was never allocated */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1618 
/*
   MatDuplicate_SeqAIJCUSPARSE - duplicates a SeqAIJCUSPARSE matrix: duplicates
   the CPU SeqAIJ data, then attaches a fresh (empty) GPU container to the copy
   and rewires its operations table. The GPU data itself is rebuilt lazily.

   Fix: removed a dead store that zeroed the tri-factor handle immediately
   before cusparseCreate() assigned it; repeated casts replaced by local
   typed pointers.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;

    C->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    C->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* GPU copy not built yet; it is created on the first copy-to-GPU */
  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1677 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a SeqAIJ matrix in place into a
   SeqAIJCUSPARSE matrix: switches the default vector type to CUDA, attaches an
   empty GPU container in B->spptr (regular multiply structures or triangular
   factor containers, depending on the factor type), and rewires the operations
   table. GPU data is populated later (lazily) by the copy-to-GPU routine.
*/
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  /* vectors created to work with this matrix should live on the GPU */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;

    B->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* factored matrix: attach an empty triangular-factor container */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    B->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1731 
/*
   MatCreate_SeqAIJCUSPARSE - type constructor: builds a SeqAIJ matrix and
   converts it in place to SeqAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* fix: the error code from the conversion was previously dropped (no CHKERRQ) */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1741 
1742 /*MC
1743    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1744 
1745    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1746    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1747    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1748 
1749    Options Database Keys:
1750 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1751 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1752 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1753 
1754   Level: beginner
1755 
1756 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1757 M*/
1758 
1759 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1760 
1761 
/*
   MatSolverTypeRegister_CUSPARSE - registers the CUSPARSE solver package as a
   factorization provider for SeqAIJCUSPARSE matrices (LU, Cholesky, ILU, ICC),
   all backed by the same factory routine.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factortypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscInt            i;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  for (i=0; i<(PetscInt)(sizeof(factortypes)/sizeof(factortypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factortypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1773 
1774 
/*
   MatSeqAIJCUSPARSE_Destroy - frees the GPU-side data of a non-factored
   SeqAIJCUSPARSE matrix: both multiply structures (the matrix and its cached
   transpose), the scatter work vector, and the cusparse handle. No-op when
   *cusparsestruct is already NULL; always leaves it set to 0.

   Fix: the PetscErrorCode returned by the two MultStruct destroy calls was
   previously ignored; now checked with CHKERRQ.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    if ((handle = (*cusparsestruct)->handle)) { /* intentional assignment-in-condition */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1793 
/*
   CsrMatrix_Destroy - frees a CsrMatrix container (values, column indices,
   row offsets) and nulls the caller's pointer. No-op when *mat is NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    *mat = 0;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
  }
  PetscFunctionReturn(0);
}
1806 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - frees one
   triangular factor: its cusparse matrix descriptor, its solve analysis info,
   and its CSR storage. No-op when *trifactor is NULL; leaves it set to 0.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1822 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (format overload) - frees a multiply
   structure: the stored matrix (HYB handle or CSR container, depending on
   format), the matrix descriptor, the compressed-row index array, and the
   device-resident alpha/beta scalars. No-op when *matstruct is NULL.

   Fix: the PetscErrorCode returned by CsrMatrix_Destroy() was previously
   ignored; now checked with CHKERRQ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1850 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - frees all triangular-factor data of a
   factored SeqAIJCUSPARSE matrix: the four factor structures (lower/upper and
   their transposes), the permutation index arrays, the work vector, and the
   cusparse handle. No-op when *trifactors is NULL; leaves it set to 0.

   Fix: the PetscErrorCode returned by the four factor destroy calls was
   previously ignored; now checked with CHKERRQ.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) { /* intentional assignment-in-condition */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1873 
1874