xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 57a5760de5cdf3787b07bed859a287995d4838a2)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
41 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
42 {
43   cusparseStatus_t   stat;
44   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
45 
46   PetscFunctionBegin;
47   cusparsestruct->stream = stream;
48   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
49   PetscFunctionReturn(0);
50 }
51 
52 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
53 {
54   cusparseStatus_t   stat;
55   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
56 
57   PetscFunctionBegin;
58   if (cusparsestruct->handle != handle) {
59     if (cusparsestruct->handle) {
60       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
61     }
62     cusparsestruct->handle = handle;
63   }
64   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
65   PetscFunctionReturn(0);
66 }
67 
68 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
69 {
70   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
71   PetscFunctionBegin;
72   if (cusparsestruct->handle)
73     cusparsestruct->handle = 0;
74   PetscFunctionReturn(0);
75 }
76 
77 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
78 {
79   PetscFunctionBegin;
80   *type = MATSOLVERCUSPARSE;
81   PetscFunctionReturn(0);
82 }
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
98 {
99   PetscErrorCode ierr;
100   PetscInt       n = A->rmap->n;
101 
102   PetscFunctionBegin;
103   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
104   (*B)->factortype = ftype;
105   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
106   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
107 
108   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
110     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
116 
117   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
118   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123 {
124   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
125 
126   PetscFunctionBegin;
127 #if CUDA_VERSION>=4020
128   switch (op) {
129   case MAT_CUSPARSE_MULT:
130     cusparsestruct->format = format;
131     break;
132   case MAT_CUSPARSE_ALL:
133     cusparsestruct->format = format;
134     break;
135   default:
136     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137   }
138 #else
139   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140 #endif
141   PetscFunctionReturn(0);
142 }
143 
144 /*@
145    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146    operation. Only the MatMult operation can use different GPU storage formats
147    for MPIAIJCUSPARSE matrices.
148    Not Collective
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
154 
155    Output Parameter:
156 
157    Level: intermediate
158 
159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160 @*/
161 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162 {
163   PetscErrorCode ierr;
164 
165   PetscFunctionBegin;
166   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
167   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
168   PetscFunctionReturn(0);
169 }
170 
171 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172 {
173   PetscErrorCode           ierr;
174   MatCUSPARSEStorageFormat format;
175   PetscBool                flg;
176   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
177 
178   PetscFunctionBegin;
179   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
180   if (A->factortype==MAT_FACTOR_NONE) {
181     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
182                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
183     if (flg) {
184       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
185     }
186   }
187   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
188                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
189   if (flg) {
190     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
191   }
192   ierr = PetscOptionsTail();CHKERRQ(ierr);
193   PetscFunctionReturn(0);
194 
195 }
196 
197 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
198 {
199   PetscErrorCode ierr;
200 
201   PetscFunctionBegin;
202   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
203   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
204   PetscFunctionReturn(0);
205 }
206 
207 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
208 {
209   PetscErrorCode ierr;
210 
211   PetscFunctionBegin;
212   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
213   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
214   PetscFunctionReturn(0);
215 }
216 
217 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
218 {
219   PetscErrorCode ierr;
220 
221   PetscFunctionBegin;
222   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
223   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
224   PetscFunctionReturn(0);
225 }
226 
227 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
228 {
229   PetscErrorCode ierr;
230 
231   PetscFunctionBegin;
232   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
233   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
234   PetscFunctionReturn(0);
235 }
236 
237 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
238 {
239   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
240   PetscInt                          n = A->rmap->n;
241   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
242   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
243   cusparseStatus_t                  stat;
244   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
245   const MatScalar                   *aa = a->a,*v;
246   PetscInt                          *AiLo, *AjLo;
247   PetscScalar                       *AALo;
248   PetscInt                          i,nz, nzLower, offset, rowOffset;
249   PetscErrorCode                    ierr;
250 
251   PetscFunctionBegin;
252   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
253     try {
254       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255       nzLower=n+ai[n]-ai[1];
256 
257       /* Allocate Space for the lower triangular matrix */
258       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
259       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
260       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
261 
262       /* Fill the lower triangular matrix */
263       AiLo[0]  = (PetscInt) 0;
264       AiLo[n]  = nzLower;
265       AjLo[0]  = (PetscInt) 0;
266       AALo[0]  = (MatScalar) 1.0;
267       v        = aa;
268       vi       = aj;
269       offset   = 1;
270       rowOffset= 1;
271       for (i=1; i<n; i++) {
272         nz = ai[i+1] - ai[i];
273         /* additional 1 for the term on the diagonal */
274         AiLo[i]    = rowOffset;
275         rowOffset += nz+1;
276 
277         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
278         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
279 
280         offset      += nz;
281         AjLo[offset] = (PetscInt) i;
282         AALo[offset] = (MatScalar) 1.0;
283         offset      += 1;
284 
285         v  += nz;
286         vi += nz;
287       }
288 
289       /* allocate space for the triangular factor information */
290       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
291 
292       /* Create the matrix description */
293       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
294       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
295       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
296       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
297       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
298 
299       /* Create the solve analysis information */
300       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
301 
302       /* set the operation */
303       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
304 
305       /* set the matrix */
306       loTriFactor->csrMat = new CsrMatrix;
307       loTriFactor->csrMat->num_rows = n;
308       loTriFactor->csrMat->num_cols = n;
309       loTriFactor->csrMat->num_entries = nzLower;
310 
311       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
312       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
313 
314       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
315       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
316 
317       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
318       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
319 
320       /* perform the solve analysis */
321       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
322                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
323                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
324                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
325 
326       /* assign the pointer. Is this really necessary? */
327       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
328 
329       ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
330       ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
331       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
332       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
333     } catch(char *ex) {
334       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335     }
336   }
337   PetscFunctionReturn(0);
338 }
339 
340 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341 {
342   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
343   PetscInt                          n = A->rmap->n;
344   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346   cusparseStatus_t                  stat;
347   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
348   const MatScalar                   *aa = a->a,*v;
349   PetscInt                          *AiUp, *AjUp;
350   PetscScalar                       *AAUp;
351   PetscInt                          i,nz, nzUpper, offset;
352   PetscErrorCode                    ierr;
353 
354   PetscFunctionBegin;
355   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
356     try {
357       /* next, figure out the number of nonzeros in the upper triangular matrix. */
358       nzUpper = adiag[0]-adiag[n];
359 
360       /* Allocate Space for the upper triangular matrix */
361       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
364 
365       /* Fill the upper triangular matrix */
366       AiUp[0]=(PetscInt) 0;
367       AiUp[n]=nzUpper;
368       offset = nzUpper;
369       for (i=n-1; i>=0; i--) {
370         v  = aa + adiag[i+1] + 1;
371         vi = aj + adiag[i+1] + 1;
372 
373         /* number of elements NOT on the diagonal */
374         nz = adiag[i] - adiag[i+1]-1;
375 
376         /* decrement the offset */
377         offset -= (nz+1);
378 
379         /* first, set the diagonal elements */
380         AjUp[offset] = (PetscInt) i;
381         AAUp[offset] = (MatScalar)1./v[nz];
382         AiUp[i]      = AiUp[i+1] - (nz+1);
383 
384         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
385         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
386       }
387 
388       /* allocate space for the triangular factor information */
389       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
390 
391       /* Create the matrix description */
392       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
397 
398       /* Create the solve analysis information */
399       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
400 
401       /* set the operation */
402       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
403 
404       /* set the matrix */
405       upTriFactor->csrMat = new CsrMatrix;
406       upTriFactor->csrMat->num_rows = n;
407       upTriFactor->csrMat->num_cols = n;
408       upTriFactor->csrMat->num_entries = nzUpper;
409 
410       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
412 
413       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
415 
416       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
418 
419       /* perform the solve analysis */
420       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
424 
425       /* assign the pointer. Is this really necessary? */
426       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
427 
428       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
432     } catch(char *ex) {
433       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
434     }
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
440 {
441   PetscErrorCode               ierr;
442   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
443   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
444   IS                           isrow = a->row,iscol = a->icol;
445   PetscBool                    row_identity,col_identity;
446   const PetscInt               *r,*c;
447   PetscInt                     n = A->rmap->n;
448 
449   PetscFunctionBegin;
450   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
451   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
452 
453   cusparseTriFactors->workVector = new THRUSTARRAY(n);
454   cusparseTriFactors->nnz=a->nz;
455 
456   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
457   /*lower triangular indices */
458   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
459   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
460   if (!row_identity) {
461     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462     cusparseTriFactors->rpermIndices->assign(r, r+n);
463   }
464   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
465 
466   /*upper triangular indices */
467   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
468   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
469   if (!col_identity) {
470     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471     cusparseTriFactors->cpermIndices->assign(c, c+n);
472   }
473 
474   if(!row_identity && !col_identity) {
475     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
476   } else if(!row_identity) {
477     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
478   } else if(!col_identity) {
479     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
480   }
481 
482   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
487 {
488   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
489   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
491   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
492   cusparseStatus_t                  stat;
493   PetscErrorCode                    ierr;
494   PetscInt                          *AiUp, *AjUp;
495   PetscScalar                       *AAUp;
496   PetscScalar                       *AALo;
497   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
498   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
499   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
500   const MatScalar                   *aa = b->a,*v;
501 
502   PetscFunctionBegin;
503   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
504     try {
505       /* Allocate Space for the upper triangular matrix */
506       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
507       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
508       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
509       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
510 
511       /* Fill the upper triangular matrix */
512       AiUp[0]=(PetscInt) 0;
513       AiUp[n]=nzUpper;
514       offset = 0;
515       for (i=0; i<n; i++) {
516         /* set the pointers */
517         v  = aa + ai[i];
518         vj = aj + ai[i];
519         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
520 
521         /* first, set the diagonal elements */
522         AjUp[offset] = (PetscInt) i;
523         AAUp[offset] = (MatScalar)1.0/v[nz];
524         AiUp[i]      = offset;
525         AALo[offset] = (MatScalar)1.0/v[nz];
526 
527         offset+=1;
528         if (nz>0) {
529           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
530           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
531           for (j=offset; j<offset+nz; j++) {
532             AAUp[j] = -AAUp[j];
533             AALo[j] = AAUp[j]/v[nz];
534           }
535           offset+=nz;
536         }
537       }
538 
539       /* allocate space for the triangular factor information */
540       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
541 
542       /* Create the matrix description */
543       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
544       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
545       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
546       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
547       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
548 
549       /* Create the solve analysis information */
550       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
551 
552       /* set the operation */
553       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
554 
555       /* set the matrix */
556       upTriFactor->csrMat = new CsrMatrix;
557       upTriFactor->csrMat->num_rows = A->rmap->n;
558       upTriFactor->csrMat->num_cols = A->cmap->n;
559       upTriFactor->csrMat->num_entries = a->nz;
560 
561       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
562       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
563 
564       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
565       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
566 
567       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
568       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
569 
570       /* perform the solve analysis */
571       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
572                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
573                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
574                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
575 
576       /* assign the pointer. Is this really necessary? */
577       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
578 
579       /* allocate space for the triangular factor information */
580       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
581 
582       /* Create the matrix description */
583       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
584       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
585       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
586       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
587       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
588 
589       /* Create the solve analysis information */
590       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
591 
592       /* set the operation */
593       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
594 
595       /* set the matrix */
596       loTriFactor->csrMat = new CsrMatrix;
597       loTriFactor->csrMat->num_rows = A->rmap->n;
598       loTriFactor->csrMat->num_cols = A->cmap->n;
599       loTriFactor->csrMat->num_entries = a->nz;
600 
601       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
602       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
603 
604       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
605       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
606 
607       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
608       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
609       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
610 
611       /* perform the solve analysis */
612       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
613                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
614                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
615                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
616 
617       /* assign the pointer. Is this really necessary? */
618       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
619 
620       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
621       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
622       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
623       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
624       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
625     } catch(char *ex) {
626       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
627     }
628   }
629   PetscFunctionReturn(0);
630 }
631 
632 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
633 {
634   PetscErrorCode               ierr;
635   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
636   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
637   IS                           ip = a->row;
638   const PetscInt               *rip;
639   PetscBool                    perm_identity;
640   PetscInt                     n = A->rmap->n;
641 
642   PetscFunctionBegin;
643   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
644   cusparseTriFactors->workVector = new THRUSTARRAY(n);
645   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
646 
647   /*lower triangular indices */
648   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
649   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
650   if (!perm_identity) {
651     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
652     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
653     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
654     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
655     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
656   }
657   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
658   PetscFunctionReturn(0);
659 }
660 
661 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
662 {
663   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
664   IS             isrow = b->row,iscol = b->col;
665   PetscBool      row_identity,col_identity;
666   PetscErrorCode ierr;
667 
668   PetscFunctionBegin;
669   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
670   /* determine which version of MatSolve needs to be used. */
671   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
672   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
673   if (row_identity && col_identity) {
674     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
675     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
676   } else {
677     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
678     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
679   }
680 
681   /* get the triangular factors */
682   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
683   PetscFunctionReturn(0);
684 }
685 
686 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
687 {
688   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
689   IS             ip = b->row;
690   PetscBool      perm_identity;
691   PetscErrorCode ierr;
692 
693   PetscFunctionBegin;
694   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
695 
696   /* determine which version of MatSolve needs to be used. */
697   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
698   if (perm_identity) {
699     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
700     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
701   } else {
702     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
703     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
704   }
705 
706   /* get the triangular factors */
707   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
708   PetscFunctionReturn(0);
709 }
710 
711 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
712 {
713   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
714   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
715   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
716   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
717   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
718   cusparseStatus_t                  stat;
719   cusparseIndexBase_t               indexBase;
720   cusparseMatrixType_t              matrixType;
721   cusparseFillMode_t                fillMode;
722   cusparseDiagType_t                diagType;
723 
724   PetscFunctionBegin;
725 
726   /*********************************************/
727   /* Now the Transpose of the Lower Tri Factor */
728   /*********************************************/
729 
730   /* allocate space for the transpose of the lower triangular factor */
731   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
732 
733   /* set the matrix descriptors of the lower triangular factor */
734   matrixType = cusparseGetMatType(loTriFactor->descr);
735   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
736   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
737     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
738   diagType = cusparseGetMatDiagType(loTriFactor->descr);
739 
740   /* Create the matrix description */
741   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
742   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
743   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
744   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
745   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
746 
747   /* Create the solve analysis information */
748   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
749 
750   /* set the operation */
751   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
752 
753   /* allocate GPU space for the CSC of the lower triangular factor*/
754   loTriFactorT->csrMat = new CsrMatrix;
755   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
756   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
757   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
758   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
759   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
760   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
761 
762   /* compute the transpose of the lower triangular factor, i.e. the CSC */
763   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
764                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
765                           loTriFactor->csrMat->values->data().get(),
766                           loTriFactor->csrMat->row_offsets->data().get(),
767                           loTriFactor->csrMat->column_indices->data().get(),
768                           loTriFactorT->csrMat->values->data().get(),
769                           loTriFactorT->csrMat->column_indices->data().get(),
770                           loTriFactorT->csrMat->row_offsets->data().get(),
771                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
772 
773   /* perform the solve analysis on the transposed matrix */
774   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
775                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
776                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
777                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
778                            loTriFactorT->solveInfo);CHKERRCUDA(stat);
779 
780   /* assign the pointer. Is this really necessary? */
781   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
782 
783   /*********************************************/
784   /* Now the Transpose of the Upper Tri Factor */
785   /*********************************************/
786 
787   /* allocate space for the transpose of the upper triangular factor */
788   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
789 
790   /* set the matrix descriptors of the upper triangular factor */
791   matrixType = cusparseGetMatType(upTriFactor->descr);
792   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
793   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
794     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
795   diagType = cusparseGetMatDiagType(upTriFactor->descr);
796 
797   /* Create the matrix description */
798   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
799   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
800   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
801   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
802   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
803 
804   /* Create the solve analysis information */
805   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
806 
807   /* set the operation */
808   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
809 
810   /* allocate GPU space for the CSC of the upper triangular factor*/
811   upTriFactorT->csrMat = new CsrMatrix;
812   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
813   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
814   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
815   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
816   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
817   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
818 
819   /* compute the transpose of the upper triangular factor, i.e. the CSC */
820   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
821                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
822                           upTriFactor->csrMat->values->data().get(),
823                           upTriFactor->csrMat->row_offsets->data().get(),
824                           upTriFactor->csrMat->column_indices->data().get(),
825                           upTriFactorT->csrMat->values->data().get(),
826                           upTriFactorT->csrMat->column_indices->data().get(),
827                           upTriFactorT->csrMat->row_offsets->data().get(),
828                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
829 
830   /* perform the solve analysis on the transposed matrix */
831   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
832                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
833                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
834                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
835                            upTriFactorT->solveInfo);CHKERRCUDA(stat);
836 
837   /* assign the pointer. Is this really necessary? */
838   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
839   PetscFunctionReturn(0);
840 }
841 
842 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
843 {
844   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
845   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
846   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
847   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
848   cusparseStatus_t             stat;
849   cusparseIndexBase_t          indexBase;
850   cudaError_t                  err;
851   PetscErrorCode               ierr;
852 
853   PetscFunctionBegin;
854 
855   /* allocate space for the triangular factor information */
856   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
857   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
858   indexBase = cusparseGetMatIndexBase(matstruct->descr);
859   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
860   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
861 
862   /* set alpha and beta */
863   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
864   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
865   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
866   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
867   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
868   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
869   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
870 
871   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
872     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
873     CsrMatrix *matrixT= new CsrMatrix;
874     matrixT->num_rows = A->cmap->n;
875     matrixT->num_cols = A->rmap->n;
876     matrixT->num_entries = a->nz;
877     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
878     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
879     matrixT->values = new THRUSTARRAY(a->nz);
880 
881     /* compute the transpose of the upper triangular factor, i.e. the CSC */
882     indexBase = cusparseGetMatIndexBase(matstruct->descr);
883     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
884                             matrix->num_cols, matrix->num_entries,
885                             matrix->values->data().get(),
886                             matrix->row_offsets->data().get(),
887                             matrix->column_indices->data().get(),
888                             matrixT->values->data().get(),
889                             matrixT->column_indices->data().get(),
890                             matrixT->row_offsets->data().get(),
891                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
892 
893     /* assign the pointer */
894     matstructT->mat = matrixT;
895     ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
896   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
897 #if CUDA_VERSION>=5000
898     /* First convert HYB to CSR */
899     CsrMatrix *temp= new CsrMatrix;
900     temp->num_rows = A->rmap->n;
901     temp->num_cols = A->cmap->n;
902     temp->num_entries = a->nz;
903     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
904     temp->column_indices = new THRUSTINTARRAY32(a->nz);
905     temp->values = new THRUSTARRAY(a->nz);
906 
907 
908     stat = cusparse_hyb2csr(cusparsestruct->handle,
909                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
910                             temp->values->data().get(),
911                             temp->row_offsets->data().get(),
912                             temp->column_indices->data().get());CHKERRCUDA(stat);
913 
914     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
915     CsrMatrix *tempT= new CsrMatrix;
916     tempT->num_rows = A->rmap->n;
917     tempT->num_cols = A->cmap->n;
918     tempT->num_entries = a->nz;
919     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
920     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
921     tempT->values = new THRUSTARRAY(a->nz);
922 
923     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
924                             temp->num_cols, temp->num_entries,
925                             temp->values->data().get(),
926                             temp->row_offsets->data().get(),
927                             temp->column_indices->data().get(),
928                             tempT->values->data().get(),
929                             tempT->column_indices->data().get(),
930                             tempT->row_offsets->data().get(),
931                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
932 
933     /* Last, convert CSC to HYB */
934     cusparseHybMat_t hybMat;
935     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
936     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
937       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
938     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
939                             matstructT->descr, tempT->values->data().get(),
940                             tempT->row_offsets->data().get(),
941                             tempT->column_indices->data().get(),
942                             hybMat, 0, partition);CHKERRCUDA(stat);
943 
944     /* assign the pointer */
945     matstructT->mat = hybMat;
946     ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);
947 
948     /* delete temporaries */
949     if (tempT) {
950       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
951       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
952       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
953       delete (CsrMatrix*) tempT;
954     }
955     if (temp) {
956       if (temp->values) delete (THRUSTARRAY*) temp->values;
957       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
958       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
959       delete (CsrMatrix*) temp;
960     }
961 #else
962     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
963 #endif
964   }
965   /* assign the compressed row indices */
966   matstructT->cprowIndices = new THRUSTINTARRAY;
967   matstructT->cprowIndices->resize(A->cmap->n);
968   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
969   /* assign the pointer */
970   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
971   PetscFunctionReturn(0);
972 }
973 
974 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
975 {
976   PetscInt                              n = xx->map->n;
977   const PetscScalar                     *barray;
978   PetscScalar                           *xarray;
979   thrust::device_ptr<const PetscScalar> bGPU;
980   thrust::device_ptr<PetscScalar>       xGPU;
981   cusparseStatus_t                      stat;
982   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
983   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
984   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
985   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
986   PetscErrorCode                        ierr;
987 
988   PetscFunctionBegin;
989   /* Analyze the matrix and create the transpose ... on the fly */
990   if (!loTriFactorT && !upTriFactorT) {
991     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
992     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
993     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
994   }
995 
996   /* Get the GPU pointers */
997   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
998   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
999   xGPU = thrust::device_pointer_cast(xarray);
1000   bGPU = thrust::device_pointer_cast(barray);
1001 
1002   /* First, reorder with the row permutation */
1003   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1004                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1005                xGPU);
1006 
1007   /* First, solve U */
1008   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1009                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1010                         upTriFactorT->csrMat->values->data().get(),
1011                         upTriFactorT->csrMat->row_offsets->data().get(),
1012                         upTriFactorT->csrMat->column_indices->data().get(),
1013                         upTriFactorT->solveInfo,
1014                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1015 
1016   /* Then, solve L */
1017   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1018                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1019                         loTriFactorT->csrMat->values->data().get(),
1020                         loTriFactorT->csrMat->row_offsets->data().get(),
1021                         loTriFactorT->csrMat->column_indices->data().get(),
1022                         loTriFactorT->solveInfo,
1023                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1024 
1025   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1026   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1027                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1028                tempGPU->begin());
1029 
1030   /* Copy the temporary to the full solution. */
1031   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1032 
1033   /* restore */
1034   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1035   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1036   ierr = WaitForGPU();CHKERRCUDA(ierr);
1037 
1038   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1039   PetscFunctionReturn(0);
1040 }
1041 
1042 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1043 {
1044   const PetscScalar                 *barray;
1045   PetscScalar                       *xarray;
1046   cusparseStatus_t                  stat;
1047   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1048   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1049   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1050   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1051   PetscErrorCode                    ierr;
1052 
1053   PetscFunctionBegin;
1054   /* Analyze the matrix and create the transpose ... on the fly */
1055   if (!loTriFactorT && !upTriFactorT) {
1056     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1057     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1058     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1059   }
1060 
1061   /* Get the GPU pointers */
1062   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1063   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1064 
1065   /* First, solve U */
1066   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1067                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1068                         upTriFactorT->csrMat->values->data().get(),
1069                         upTriFactorT->csrMat->row_offsets->data().get(),
1070                         upTriFactorT->csrMat->column_indices->data().get(),
1071                         upTriFactorT->solveInfo,
1072                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1073 
1074   /* Then, solve L */
1075   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1077                         loTriFactorT->csrMat->values->data().get(),
1078                         loTriFactorT->csrMat->row_offsets->data().get(),
1079                         loTriFactorT->csrMat->column_indices->data().get(),
1080                         loTriFactorT->solveInfo,
1081                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1082 
1083   /* restore */
1084   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1085   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1086   ierr = WaitForGPU();CHKERRCUDA(ierr);
1087   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1088   PetscFunctionReturn(0);
1089 }
1090 
1091 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1092 {
1093   const PetscScalar                     *barray;
1094   PetscScalar                           *xarray;
1095   thrust::device_ptr<const PetscScalar> bGPU;
1096   thrust::device_ptr<PetscScalar>       xGPU;
1097   cusparseStatus_t                      stat;
1098   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1099   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1100   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1101   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1102   PetscErrorCode                        ierr;
1103 
1104   PetscFunctionBegin;
1105 
1106   /* Get the GPU pointers */
1107   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1108   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1109   xGPU = thrust::device_pointer_cast(xarray);
1110   bGPU = thrust::device_pointer_cast(barray);
1111 
1112   /* First, reorder with the row permutation */
1113   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1114                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1115                xGPU);
1116 
1117   /* Next, solve L */
1118   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1119                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1120                         loTriFactor->csrMat->values->data().get(),
1121                         loTriFactor->csrMat->row_offsets->data().get(),
1122                         loTriFactor->csrMat->column_indices->data().get(),
1123                         loTriFactor->solveInfo,
1124                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1125 
1126   /* Then, solve U */
1127   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1128                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1129                         upTriFactor->csrMat->values->data().get(),
1130                         upTriFactor->csrMat->row_offsets->data().get(),
1131                         upTriFactor->csrMat->column_indices->data().get(),
1132                         upTriFactor->solveInfo,
1133                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1134 
1135   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1136   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1137                thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1138                tempGPU->begin());
1139 
1140   /* Copy the temporary to the full solution. */
1141   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1142 
1143   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1144   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1145   ierr = WaitForGPU();CHKERRCUDA(ierr);
1146   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1151 {
1152   const PetscScalar                 *barray;
1153   PetscScalar                       *xarray;
1154   cusparseStatus_t                  stat;
1155   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1156   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1157   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1158   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1159   PetscErrorCode                    ierr;
1160 
1161   PetscFunctionBegin;
1162   /* Get the GPU pointers */
1163   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1164   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1165 
1166   /* First, solve L */
1167   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1168                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1169                         loTriFactor->csrMat->values->data().get(),
1170                         loTriFactor->csrMat->row_offsets->data().get(),
1171                         loTriFactor->csrMat->column_indices->data().get(),
1172                         loTriFactor->solveInfo,
1173                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1174 
1175   /* Next, solve U */
1176   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1177                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1178                         upTriFactor->csrMat->values->data().get(),
1179                         upTriFactor->csrMat->row_offsets->data().get(),
1180                         upTriFactor->csrMat->column_indices->data().get(),
1181                         upTriFactor->solveInfo,
1182                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1183 
1184   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1185   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1186   ierr = WaitForGPU();CHKERRCUDA(ierr);
1187   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1192 {
1193 
1194   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1195   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1196   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1197   PetscInt                     m = A->rmap->n,*ii,*ridx;
1198   PetscErrorCode               ierr;
1199   cusparseStatus_t             stat;
1200   cudaError_t                  err;
1201 
1202   PetscFunctionBegin;
1203   if (A->pinnedtocpu) PetscFunctionReturn(0);
1204   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1205     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1206     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1207       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1208       /* copy values only */
1209       matrix->values->assign(a->a, a->a+a->nz);
1210       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1211     } else {
1212       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1213       try {
1214         cusparsestruct->nonzerorow=0;
1215         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1216 
1217         if (a->compressedrow.use) {
1218           m    = a->compressedrow.nrows;
1219           ii   = a->compressedrow.i;
1220           ridx = a->compressedrow.rindex;
1221         } else {
1222           /* Forcing compressed row on the GPU */
1223           int k=0;
1224           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1225           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1226           ii[0]=0;
1227           for (int j = 0; j<m; j++) {
1228             if ((a->i[j+1]-a->i[j])>0) {
1229               ii[k]  = a->i[j];
1230               ridx[k]= j;
1231               k++;
1232             }
1233           }
1234           ii[cusparsestruct->nonzerorow] = a->nz;
1235           m = cusparsestruct->nonzerorow;
1236         }
1237 
1238         /* allocate space for the triangular factor information */
1239         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1240         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1241         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1242         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1243 
1244         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1245         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1246         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1247         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1248         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1249         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1250         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1251 
1252         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1253         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1254           /* set the matrix */
1255           CsrMatrix *matrix= new CsrMatrix;
1256           matrix->num_rows = m;
1257           matrix->num_cols = A->cmap->n;
1258           matrix->num_entries = a->nz;
1259           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1260           matrix->row_offsets->assign(ii, ii + m+1);
1261 
1262           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1263           matrix->column_indices->assign(a->j, a->j+a->nz);
1264 
1265           matrix->values = new THRUSTARRAY(a->nz);
1266           matrix->values->assign(a->a, a->a+a->nz);
1267 
1268           /* assign the pointer */
1269           matstruct->mat = matrix;
1270 
1271         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1272 #if CUDA_VERSION>=4020
1273           CsrMatrix *matrix= new CsrMatrix;
1274           matrix->num_rows = m;
1275           matrix->num_cols = A->cmap->n;
1276           matrix->num_entries = a->nz;
1277           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1278           matrix->row_offsets->assign(ii, ii + m+1);
1279 
1280           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1281           matrix->column_indices->assign(a->j, a->j+a->nz);
1282 
1283           matrix->values = new THRUSTARRAY(a->nz);
1284           matrix->values->assign(a->a, a->a+a->nz);
1285 
1286           cusparseHybMat_t hybMat;
1287           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1288           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1289             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1290           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1291               matstruct->descr, matrix->values->data().get(),
1292               matrix->row_offsets->data().get(),
1293               matrix->column_indices->data().get(),
1294               hybMat, 0, partition);CHKERRCUDA(stat);
1295           /* assign the pointer */
1296           matstruct->mat = hybMat;
1297 
1298           if (matrix) {
1299             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1300             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1301             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1302             delete (CsrMatrix*)matrix;
1303           }
1304 #endif
1305         }
1306 
1307         /* assign the compressed row indices */
1308         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1309         matstruct->cprowIndices->assign(ridx,ridx+m);
1310         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1311 
1312         /* assign the pointer */
1313         cusparsestruct->mat = matstruct;
1314 
1315         if (!a->compressedrow.use) {
1316           ierr = PetscFree(ii);CHKERRQ(ierr);
1317           ierr = PetscFree(ridx);CHKERRQ(ierr);
1318         }
1319         cusparsestruct->workVector = new THRUSTARRAY(m);
1320       } catch(char *ex) {
1321         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1322       }
1323       cusparsestruct->nonzerostate = A->nonzerostate;
1324     }
1325     ierr = WaitForGPU();CHKERRCUDA(ierr);
1326     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1327     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1328   }
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 struct VecCUDAPlusEquals
1333 {
1334   template <typename Tuple>
1335   __host__ __device__
1336   void operator()(Tuple t)
1337   {
1338     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1339   }
1340 };
1341 
1342 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1343 {
1344   PetscErrorCode ierr;
1345 
1346   PetscFunctionBegin;
1347   ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
1348   PetscFunctionReturn(0);
1349 }
1350 
1351 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1352 {
1353   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1354   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1355   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1356   const PetscScalar            *xarray;
1357   PetscScalar                  *yarray;
1358   PetscErrorCode               ierr;
1359   cusparseStatus_t             stat;
1360 
1361   PetscFunctionBegin;
1362   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1363   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1364   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1365   if (!matstructT) {
1366     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1367     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1368   }
1369   ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1370   ierr = VecSet(yy,0);CHKERRQ(ierr);
1371   ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1372 
1373   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1374     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1375     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1376                              mat->num_rows, mat->num_cols,
1377                              mat->num_entries, matstructT->alpha, matstructT->descr,
1378                              mat->values->data().get(), mat->row_offsets->data().get(),
1379                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1380                              yarray);CHKERRCUDA(stat);
1381   } else {
1382 #if CUDA_VERSION>=4020
1383     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1384     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1385                              matstructT->alpha, matstructT->descr, hybMat,
1386                              xarray, matstructT->beta_zero,
1387                              yarray);CHKERRCUDA(stat);
1388 #endif
1389   }
1390   ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1391   ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1392   if (!cusparsestruct->stream) {
1393     ierr = WaitForGPU();CHKERRCUDA(ierr);
1394   }
1395   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1396   PetscFunctionReturn(0);
1397 }
1398 
1399 
1400 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1401 {
1402   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1403   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1404   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1405   const PetscScalar            *xarray;
1406   PetscScalar                  *zarray,*dptr,*beta;
1407   PetscErrorCode               ierr;
1408   cusparseStatus_t             stat;
1409 
1410   PetscFunctionBegin;
1411   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1412   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1413   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1414   try {
1415     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1416     ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1417     dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
1418     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
1419 
1420     /* csr_spmv is multiply add */
1421     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1422       /* here we need to be careful to set the number of rows in the multiply to the
1423          number of compressed rows in the matrix ... which is equivalent to the
1424          size of the workVector */
1425       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1426       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1427                                mat->num_rows, mat->num_cols,
1428                                mat->num_entries, matstruct->alpha, matstruct->descr,
1429                                mat->values->data().get(), mat->row_offsets->data().get(),
1430                                mat->column_indices->data().get(), xarray, beta,
1431                                dptr);CHKERRCUDA(stat);
1432     } else {
1433 #if CUDA_VERSION>=4020
1434       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1435       if (cusparsestruct->workVector->size()) {
1436         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1437                                  matstruct->alpha, matstruct->descr, hybMat,
1438                                  xarray, beta,
1439                                  dptr);CHKERRCUDA(stat);
1440       }
1441 #endif
1442     }
1443 
1444     if (yy) {
1445       if (dptr != zarray) {
1446         ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1447       } else if (zz != yy) {
1448         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
1449       }
1450     } else if (dptr != zarray) {
1451       ierr = VecSet(zz,0);CHKERRQ(ierr);
1452     }
1453     /* scatter the data from the temporary into the full vector with a += operation */
1454     if (dptr != zarray) {
1455       thrust::device_ptr<PetscScalar> zptr;
1456 
1457       zptr = thrust::device_pointer_cast(zarray);
1458       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1459                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1460                        VecCUDAPlusEquals());
1461     }
1462     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1463     ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1464   } catch(char *ex) {
1465     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1466   }
1467   if (!yy) { /* MatMult */
1468     if (!cusparsestruct->stream) {
1469       ierr = WaitForGPU();CHKERRCUDA(ierr);
1470     }
1471   }
1472   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1473   PetscFunctionReturn(0);
1474 }
1475 
1476 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1477 {
1478   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1479   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1480   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1481   thrust::device_ptr<PetscScalar> zptr;
1482   const PetscScalar               *xarray;
1483   PetscScalar                     *zarray;
1484   PetscErrorCode                  ierr;
1485   cusparseStatus_t                stat;
1486 
1487   PetscFunctionBegin;
1488   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1489   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1490   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1491   if (!matstructT) {
1492     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1493     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1494   }
1495 
1496   try {
1497     ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1498     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1499     ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1500     zptr = thrust::device_pointer_cast(zarray);
1501 
1502     /* multiply add with matrix transpose */
1503     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1504       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1505       /* here we need to be careful to set the number of rows in the multiply to the
1506          number of compressed rows in the matrix ... which is equivalent to the
1507          size of the workVector */
1508       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1509                                mat->num_rows, mat->num_cols,
1510                                mat->num_entries, matstructT->alpha, matstructT->descr,
1511                                mat->values->data().get(), mat->row_offsets->data().get(),
1512                                mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1513                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1514     } else {
1515 #if CUDA_VERSION>=4020
1516       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1517       if (cusparsestruct->workVector->size()) {
1518         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1519             matstructT->alpha, matstructT->descr, hybMat,
1520             xarray, matstructT->beta_zero,
1521             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1522       }
1523 #endif
1524     }
1525 
1526     /* scatter the data from the temporary into the full vector with a += operation */
1527     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1528         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1529         VecCUDAPlusEquals());
1530 
1531     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1532     ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1533 
1534   } catch(char *ex) {
1535     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1536   }
1537   ierr = WaitForGPU();CHKERRCUDA(ierr);
1538   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1539   PetscFunctionReturn(0);
1540 }
1541 
1542 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1543 {
1544   PetscErrorCode ierr;
1545 
1546   PetscFunctionBegin;
1547   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1548   if (A->factortype==MAT_FACTOR_NONE) {
1549     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1550   }
1551   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1552   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1553   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1554   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1555   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1556   PetscFunctionReturn(0);
1557 }
1558 
1559 /* --------------------------------------------------------------------------------*/
1560 /*@
1561    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1562    (the default parallel PETSc format). This matrix will ultimately pushed down
1563    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1564    assembly performance the user should preallocate the matrix storage by setting
1565    the parameter nz (or the array nnz).  By setting these parameters accurately,
1566    performance during matrix assembly can be increased by more than a factor of 50.
1567 
1568    Collective
1569 
1570    Input Parameters:
1571 +  comm - MPI communicator, set to PETSC_COMM_SELF
1572 .  m - number of rows
1573 .  n - number of columns
1574 .  nz - number of nonzeros per row (same for all rows)
1575 -  nnz - array containing the number of nonzeros in the various rows
1576          (possibly different for each row) or NULL
1577 
1578    Output Parameter:
1579 .  A - the matrix
1580 
1581    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1582    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1583    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1584 
1585    Notes:
1586    If nnz is given then nz is ignored
1587 
1588    The AIJ format (also called the Yale sparse matrix format or
1589    compressed row storage), is fully compatible with standard Fortran 77
1590    storage.  That is, the stored row and column indices can begin at
1591    either one (as in Fortran) or zero.  See the users' manual for details.
1592 
1593    Specify the preallocated storage with either nz or nnz (not both).
1594    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1595    allocation.  For large problems you MUST preallocate memory or you
1596    will get TERRIBLE performance, see the users' manual chapter on matrices.
1597 
1598    By default, this format uses inodes (identical nodes) when possible, to
1599    improve numerical efficiency of matrix-vector products and solves. We
1600    search for consecutive rows with the same nonzero structure, thereby
1601    reusing matrix information to achieve increased efficiency.
1602 
1603    Level: intermediate
1604 
1605 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1606 @*/
1607 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1608 {
1609   PetscErrorCode ierr;
1610 
1611   PetscFunctionBegin;
1612   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1613   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1614   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1615   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1616   PetscFunctionReturn(0);
1617 }
1618 
1619 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1620 {
1621   PetscErrorCode   ierr;
1622 
1623   PetscFunctionBegin;
1624   if (A->factortype==MAT_FACTOR_NONE) {
1625     if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1626       ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
1627     }
1628   } else {
1629     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1630   }
1631   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1632   PetscFunctionReturn(0);
1633 }
1634 
1635 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1636 {
1637   PetscErrorCode ierr;
1638   Mat C;
1639   cusparseStatus_t stat;
1640   cusparseHandle_t handle=0;
1641 
1642   PetscFunctionBegin;
1643   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
1644   C    = *B;
1645   ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
1646   ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);
1647 
1648   /* inject CUSPARSE-specific stuff */
1649   if (C->factortype==MAT_FACTOR_NONE) {
1650     /* you cannot check the inode.use flag here since the matrix was just created.
1651        now build a GPU matrix data structure */
1652     C->spptr = new Mat_SeqAIJCUSPARSE;
1653     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
1654     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1655     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
1656     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
1657     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1658     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1659     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
1660     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1661   } else {
1662     /* NEXT, set the pointers to the triangular factors */
1663     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1664     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1665     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1666     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1667     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1668     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1669     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1670     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1671     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1672     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1673     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1674     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1675   }
1676 
1677   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1678   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1679   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1680   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1681   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1682   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1683   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1684   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1685 
1686   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1687 
1688   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1689 
1690   ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1691   PetscFunctionReturn(0);
1692 }
1693 
1694 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1695 {
1696   PetscErrorCode ierr;
1697   cusparseStatus_t stat;
1698   cusparseHandle_t handle=0;
1699 
1700   PetscFunctionBegin;
1701   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1702   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
1703 
1704   if (B->factortype==MAT_FACTOR_NONE) {
1705     /* you cannot check the inode.use flag here since the matrix was just created.
1706        now build a GPU matrix data structure */
1707     B->spptr = new Mat_SeqAIJCUSPARSE;
1708     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1709     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1710     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1711     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1712     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1713     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1714     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1715     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1716   } else {
1717     /* NEXT, set the pointers to the triangular factors */
1718     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1719     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1720     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1721     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1722     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1723     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1724     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1725     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1726     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1727     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1728     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1729   }
1730 
1731   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1732   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1733   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1734   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1735   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1736   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1737   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1738   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1739 
1740   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1741 
1742   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1743 
1744   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1745   PetscFunctionReturn(0);
1746 }
1747 
1748 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1749 {
1750   PetscErrorCode ierr;
1751 
1752   PetscFunctionBegin;
1753   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
1754   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1755   PetscFunctionReturn(0);
1756 }
1757 
1758 /*MC
1759    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1760 
1761    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1762    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1763    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1764 
1765    Options Database Keys:
1766 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1767 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1768 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1769 
1770   Level: beginner
1771 
1772 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1773 M*/
1774 
1775 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1776 
1777 
1778 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1779 {
1780   PetscErrorCode ierr;
1781 
1782   PetscFunctionBegin;
1783   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1784   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1785   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1786   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1787   PetscFunctionReturn(0);
1788 }
1789 
1790 
1791 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1792 {
1793   cusparseStatus_t stat;
1794   cusparseHandle_t handle;
1795 
1796   PetscFunctionBegin;
1797   if (*cusparsestruct) {
1798     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1799     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1800     delete (*cusparsestruct)->workVector;
1801     if (handle = (*cusparsestruct)->handle) {
1802       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1803     }
1804     delete *cusparsestruct;
1805     *cusparsestruct = 0;
1806   }
1807   PetscFunctionReturn(0);
1808 }
1809 
1810 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1811 {
1812   PetscFunctionBegin;
1813   if (*mat) {
1814     delete (*mat)->values;
1815     delete (*mat)->column_indices;
1816     delete (*mat)->row_offsets;
1817     delete *mat;
1818     *mat = 0;
1819   }
1820   PetscFunctionReturn(0);
1821 }
1822 
1823 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1824 {
1825   cusparseStatus_t stat;
1826   PetscErrorCode   ierr;
1827 
1828   PetscFunctionBegin;
1829   if (*trifactor) {
1830     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1831     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1832     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
1833     delete *trifactor;
1834     *trifactor = 0;
1835   }
1836   PetscFunctionReturn(0);
1837 }
1838 
1839 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1840 {
1841   CsrMatrix        *mat;
1842   cusparseStatus_t stat;
1843   cudaError_t      err;
1844 
1845   PetscFunctionBegin;
1846   if (*matstruct) {
1847     if ((*matstruct)->mat) {
1848       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1849         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1850         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1851       } else {
1852         mat = (CsrMatrix*)(*matstruct)->mat;
1853         CsrMatrix_Destroy(&mat);
1854       }
1855     }
1856     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1857     delete (*matstruct)->cprowIndices;
1858     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1859     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1860     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1861     delete *matstruct;
1862     *matstruct = 0;
1863   }
1864   PetscFunctionReturn(0);
1865 }
1866 
1867 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1868 {
1869   cusparseHandle_t handle;
1870   cusparseStatus_t stat;
1871 
1872   PetscFunctionBegin;
1873   if (*trifactors) {
1874     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1875     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1876     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1877     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1878     delete (*trifactors)->rpermIndices;
1879     delete (*trifactors)->cpermIndices;
1880     delete (*trifactors)->workVector;
1881     if (handle = (*trifactors)->handle) {
1882       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1883     }
1884     delete *trifactors;
1885     *trifactors = 0;
1886   }
1887   PetscFunctionReturn(0);
1888 }
1889 
1890