xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision bfc4c25e64040512a7cdc7bcb995f785926f51e6)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* String table for the MatCUSPARSEStorageFormat enum, consumed by PetscOptionsEnum();
   the trailing entries (enum type name, option prefix, terminating 0) follow the PETSc
   enum-string-array convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix'
   CUSPARSE data structure and installs it on the stored cusparse handle,
   so subsequent cusparse calls for this matrix run on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
53 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - adopts the given cusparse handle for this matrix,
   destroying any previously stored (different) handle, and sets the pointer
   mode to device on the adopted handle.

   Fix: only destroy the stored handle when it differs from the incoming one;
   the previous code would cusparseDestroy() the very handle being installed
   when a caller passed the handle already owned by the matrix, leaving a
   dangling handle in cusparsestruct->handle.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the old handle before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
69 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - forgets (does not destroy) the cusparse handle
   stored on the matrix; ownership is assumed to lie with the caller that
   installed it via MatCUSPARSESetHandle().
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
80 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query routine composed on factor matrices: reports the solver package name. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;   /* always "cusparse" for factors created here */
  PetscFunctionReturn(0);
}
89 
90 /*MC
91   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
92   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
94   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96   algorithms are not recommended. This class does NOT support direct solver operations.
97 
98   Level: beginner
99 
100 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101 M*/
102 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   Factor-creation routine registered for MATSEQAIJCUSPARSE under the
   "cusparse" solver package: creates the (square) factor matrix B and
   installs the symbolic-factorization routines matching the factor type.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       m = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,m,m,m,m);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* the factor holds no values until numeric factorization fills it */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
129 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat(): a sequential
   AIJCUSPARSE matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both simply set it.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops store the one format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* ELL/HYB formats need CUDA 4.2; CSR (the default) needs nothing extra */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
153 
154 /*@
155    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
156    operation. Only the MatMult operation can use different GPU storage formats
157    for MPIAIJCUSPARSE matrices.
158    Not Collective
159 
160    Input Parameters:
161 +  A - Matrix of type SEQAIJCUSPARSE
162 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
163 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
164 
165    Output Parameter:
166 
167    Level: intermediate
168 
169 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
170 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
/* Public entry point: validates the matrix and dispatches to the
   type-specific MatCUSPARSESetFormat_C method if one is composed. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for matrix types lacking the method */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
182 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Options handler for SEQAIJCUSPARSE matrices: lets the user pick the GPU
   storage format for SpMV (-mat_cusparse_mult_storage_format, non-factor
   matrices only) or for everything (-mat_cusparse_storage_format).

   Fixes: the previous code opened a nested options scope with
   PetscObjectOptionsBegin() (its return value unchecked) while already
   inside the caller's options scope via PetscOptionsHead(), and closed it
   with PetscOptionsEnd(). PetscOptionsHead() must be paired with
   PetscOptionsTail() on the caller-supplied PetscOptionsObject; the nested
   Begin/End pair is removed.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* SpMV-only format choice makes no sense for factor matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
211 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: delegate entirely to the host SeqAIJ implementation, then
   swap in the CUSPARSE numeric routine so the factors end up on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
223 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: host SeqAIJ does the symbolic work; the numeric phase is
   redirected to the CUSPARSE version which copies the factors to the GPU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
235 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: host SeqAIJ symbolic phase, CUSPARSE numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
247 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: host SeqAIJ symbolic phase, CUSPARSE numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
259 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Extracts the unit-diagonal lower-triangular factor L from the combined
   LU storage of a factored SeqAIJ matrix, packs it into a CSR structure in
   pinned host memory, uploads it to the GPU via thrust, and runs the
   cusparse triangular-solve analysis on it. The result is stored in
   cusparseTriFactors->loTriFactorPtr. Only runs when the GPU copy is
   missing or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* strict-lower entries of rows 1..n-1 (row 0 of L has only the diagonal) plus n diagonal 1's */
      /* NOTE(review): reads ai[1] — presumably assumes n >= 1; confirm an empty matrix cannot reach here */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers for fast host->device transfer */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      /* row 0 of L is just the implicit unit diagonal */
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;     /* write position in AjLo/AALo */
      rowOffset= 1;     /* running CSR row pointer */
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strict-lower part of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      /* UNIT diag: the stored 1.0 diagonal entries are not used by the solve */
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* thrust assign() performs the host->device copies */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the device copies are complete; release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
363 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Extracts the upper-triangular factor U from the combined LU storage of a
   factored SeqAIJ matrix. SeqAIJ stores U's diagonal as its inverse (per
   the adiag-based layout walked below), so this routine re-inverts it while
   packing U into CSR, uploads the result, and runs the cusparse solve
   analysis. The result is stored in cusparseTriFactors->upTriFactorPtr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with the row index in this layout, hence the subtraction order */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* rows are stored back-to-front in the adiag region, so walk i from n-1 down */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        /* SeqAIJ stores 1/U(i,i) at v[nz]; invert back to the true diagonal value */
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strict-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      /* NON_UNIT: the (re-inverted) diagonal values are used by the solve */
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* thrust assign() performs the host->device copies */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* device copies done; free the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
463 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   Builds both ILU/LU triangular factors on the GPU, allocates the work
   vector used by the triangular solves, and caches the row/column
   permutation indices on the device when they are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  const PetscInt               *idx;
  PetscBool                    identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* upload L and U and run the cusparse solve analysis for each */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector for the two-stage triangular solve */
  fs->workVector = new THRUSTARRAY;
  fs->workVector->resize(n);
  fs->nnz = a->nz;

  A->valid_GPU_matrix = PETSC_CUDA_BOTH;

  /* lower-triangular (row) permutation: cached on device only when non-trivial */
  ierr = ISGetIndices(a->row,&idx);CHKERRQ(ierr);
  ierr = ISIdentity(a->row,&identity);CHKERRQ(ierr);
  if (!identity) {
    fs->rpermIndices = new THRUSTINTARRAY(n);
    fs->rpermIndices->assign(idx,idx+n);
  }
  ierr = ISRestoreIndices(a->row,&idx);CHKERRQ(ierr);

  /* upper-triangular (column) permutation */
  ierr = ISGetIndices(a->icol,&idx);CHKERRQ(ierr);
  ierr = ISIdentity(a->icol,&identity);CHKERRQ(ierr);
  if (!identity) {
    fs->cpermIndices = new THRUSTINTARRAY(n);
    fs->cpermIndices->assign(idx,idx+n);
  }
  ierr = ISRestoreIndices(a->icol,&idx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
504 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both triangular factors for an ICC/Cholesky factorization from the
   upper-triangular storage of the factored matrix. A single CSR copy of the
   (scaled) upper triangle is uploaded twice: once as U (upper, unit
   diagonal, non-transposed solve) and once with rescaled values solved in
   TRANSPOSE mode to play the role of the lower factor, so L never has to be
   formed explicitly.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (for nz) and as
     Mat_SeqSBAIJ (for i/j/a) — presumably the ICC factor uses SBAIJ-style
     upper-triangular storage here; confirm against the factorization code */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; AAUp/AALo share the index arrays AiUp/AjUp */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] holds the diagonal entry of row i; both factors get its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: negated for U, additionally scaled by 1/diag for the L-side values */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      /* UNIT diag: the stored 1/diag entries are skipped by the U solve */
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      /* UPPER fill + TRANSPOSE solve below: the upper-triangular data solved in
         transpose mode acts as the lower factor, avoiding an explicit L copy */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity pattern as the upper factor; only the values differ */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      /* device copies done; free the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
651 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   Uploads the ICC/Cholesky triangular factors to the GPU, allocates the
   solve work vector, and caches the (symmetric) permutation on the device
   when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  fs->workVector = new THRUSTARRAY;
  fs->workVector->resize(n);
  /* each strict-upper entry participates in both the U and U^T solves; the
     diagonal is counted once */
  fs->nnz = (a->nz-n)*2 + n;

  /* symmetric factorization: the same permutation serves rows and columns */
  ierr = ISGetIndices(a->row,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(a->row,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    fs->rpermIndices = new THRUSTINTARRAY(n);
    fs->rpermIndices->assign(rip, rip+n);
    fs->cpermIndices = new THRUSTINTARRAY(n);
    fs->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(a->row,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
682 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: performs the factorization on the CPU, installs
   the MatSolve variant matching the ordering, then copies the triangular
   factors to the GPU and runs the cusparse solve analysis. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             isrow = fact->row;
  IS             iscol = fact->col;
  PetscBool      natural_rows,natural_cols;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering (both permutations identity) permits the cheaper
     solve path that skips the on-device permutation steps */
  ierr = ISIdentity(isrow,&natural_rows);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&natural_cols);CHKERRQ(ierr);
  if (natural_rows && natural_cols) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* upload the triangular factors and analyze them for the GPU solves */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
709 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: factors on the CPU, installs the MatSolve
   variant matching the ordering, then copies the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm  = fact->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* an identity permutation allows the solve path without on-device reordering */
  ierr = ISIdentity(perm,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* upload the triangular factors and analyze them for the GPU solves */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
736 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Builds transposed (CSC) copies of the lower and upper triangular factors
   held in A->spptr and runs the cusparse solve analysis on them, so that
   MatSolveTranspose can execute entirely on the GPU.

   Called lazily from the MatSolveTranspose variants the first time a
   transpose solve is requested, i.e. while loTriFactorPtrTranspose and
   upTriFactorPtrTranspose are still NULL.  (It must not be called again
   once they are set: the structures below are allocated unconditionally.)

   Note the transposes are stored so that the solves remain NON_TRANSPOSE
   operations: the fill mode is flipped (a transposed lower factor is upper
   triangular and vice versa) while the diag type and index base are kept.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below; previously pre-loaded from spptr in a dead store */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill
     mode is flipped because the stored matrix is the transpose */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so the solve itself
     is a NON_TRANSPOSE operation on the transposed data */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the transposed lower factor */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor; again the
     fill mode is flipped for the transpose */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the transposed upper factor */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
869 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds the GPU representation of A^T (as CSR for the CSR storage format,
   or via a CSR detour for ELL/HYB) so that MatMultTranspose can execute on
   the GPU.  The result is stored in cusparsestruct->matTranspose; callers
   invoke this lazily when that pointer is still NULL.

   Bug fix: A is rmap->n x cmap->n, so A^T has cmap->n rows and rmap->n
   columns.  The transpose structures were previously dimensioned with A's
   own (row, column) counts, which is only correct for square matrices.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* always freshly allocated below (previous init from spptr was a dead store) */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the matrix-transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars consumed by the SpMV calls
     (hence POINTER_MODE_DEVICE on the handle) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;                 /* rows of A^T = columns of A */
    matrixT->num_cols = A->rmap->n;                 /* cols of A^T = rows of A */
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the CSC of A, which is exactly the CSR of A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;                   /* dimensions of A^T */
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert the transposed CSR back to HYB; note the (rows, cols) of
       A^T are (A->cmap->n, A->rmap->n) */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the transpose */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
998 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   GPU solve of A^T x = b for a factored matrix with a non-identity ordering:
   apply the row permutation to b, solve with U^T then L^T (since (LU)^T =
   U^T L^T), and finish by applying the column permutation.  The transposed
   factors are generated lazily on first use.  This variant is only installed
   as the solvetranspose op when the ordering is not the identity, so
   rpermIndices/cpermIndices are expected to have been allocated by the
   analysis phase.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Generate and analyze the transposed factors on the fly, the first time a
     transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder b with the row permutation: x = P_r b.
     NOTE(review): the end iterator is built on base bGPU+n whereas
     MatSolve_SeqAIJCUSPARSE uses bGPU for both; thrust::permutation_iterator
     advances via the index iterator, so the two forms appear equivalent --
     confirm and make consistent. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U^T (the result overwrites the work vector via cusparse) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L^T, writing the (still column-permuted) solution into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  /* two triangular solves over the factor nonzeros */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1068 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* GPU solve of A^T x = b for factors stored in natural ordering: no
   permutations needed, just solve with U^T followed by L^T (the transposed
   factors are generated lazily on first use). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build and analyze the transposed factors on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* fetch the device pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  /* solve with U^T, result into the work vector */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &ALPHA, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* solve with L^T, result into x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &ALPHA, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* restore the vectors and account for the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1119 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU solve of A x = b for a factored matrix with a non-identity ordering:
   validate the vector types, apply the row permutation to b, solve with L
   then U, and finish by applying the column permutation.  This variant is
   only installed as the solve op when the ordering is not the identity, so
   rpermIndices/cpermIndices are expected to have been allocated by the
   analysis phase.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  VecType                               t;
  PetscBool                             flg;

  PetscFunctionBegin;
  /* both vectors must live on the GPU (VECSEQCUDA) for the device solve */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder b with the row permutation: x = P_r b */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L (forward substitution), result into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U (back substitution), result into x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves over the factor nonzeros */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1188 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* GPU forward/back substitution x = U \ (L \ b) for factors stored in
   natural ordering; no on-device permutations are required. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* fetch the device pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  /* forward solve with L, result into the work vector */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* back solve with U, result into x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* restore the vectors and account for the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1231 
1232 #undef __FUNCT__
1233 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the host (Mat_SeqAIJ) matrix data to the GPU, building the
   CUSPARSE data structures (CSR, or ELL/HYB when that format was selected)
   stored in A->spptr. No-op when the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* same nonzero pattern as the existing GPU copy: copy values only */
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
    } else {
      /* first copy, or the nonzero pattern changed: rebuild everything */
      Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build the compressed row
             offsets (ii) and the map from compressed to full row index (ridx) */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* alpha and beta live on the device so the spmv calls can use
           CUSPARSE_POINTER_MODE_DEVICE (set just below) */
        err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* a temporary device CSR matrix is built first, converted to
             HYB/ELL with cusparse_csr2hyb, then freed */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR scratch copy is no longer needed once converted */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
#endif
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only in the !compressedrow.use branch */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* work vector sized by the number of compressed rows; used by MatMultAdd */
        cusparsestruct->workVector = new THRUSTARRAY;
        cusparsestruct->workVector->resize(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_CUDA_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1370 
1371 #undef __FUNCT__
1372 #define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates VECSEQCUDA vectors conformal with the matrix: *right is compatible
   with the column layout (a vector the matrix may be applied to) and *left
   with the row layout (a vector the result may be stored in). Either output
   pointer may be NULL when that vector is not needed.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix row layout instead of building a new one */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix column layout instead of building a new one */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1396 
/* Thrust functor: for a tuple (src, dst), accumulates src into dst (dst += src).
   Used to scatter-add the compressed-row work vector into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1406 
1407 #undef __FUNCT__
1408 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   yy = A*xx computed on the GPU with CUSPARSE (CSR path, or HYB/ELL path
   for CUDA >= 4.2). alpha/beta used by the spmv are device-resident scalars
   initialized in MatSeqAIJCUSPARSECopyToGPU().
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* NOTE(review): mat->num_rows is the compressed (nonzero) row count built
       in CopyToGPU, so this writes only the leading num_rows entries of yy
       without scattering through cprowIndices — confirm this is intended for
       matrices that contain empty rows */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1448 
1449 #undef __FUNCT__
1450 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   yy = A^T*xx on the GPU. The transpose is stored explicitly (built lazily
   by MatSeqAIJCUSPARSEGenerateTransposeForMult on first use), so the spmv
   itself runs with the NON_TRANSPOSE operation on that stored matrix.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1496 
1497 
1498 #undef __FUNCT__
1499 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A*xx on the GPU. The product A*xx is computed into the
   compressed-row work vector and then scatter-added into zz through
   cprowIndices (the compressed-to-full row map) with a thrust for_each.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    /* start from zz = yy, then accumulate the product into zz */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the spmv entirely when there are no nonzero rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray, matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1558 
1559 #undef __FUNCT__
1560 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A^T*xx on the GPU, using the lazily-built explicit transpose.
   The product is computed into the work vector and scatter-added into zz
   through the transpose struct's cprowIndices map.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy, then accumulate the product into zz */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector.
         NOTE(review): workVector is sized from the non-transposed matrix in
         CopyToGPU — confirm it matches the transpose's row count here */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the spmv entirely when there are no nonzero rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1625 
1626 #undef __FUNCT__
1627 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Finishes assembly via the base SeqAIJ routine, pushes the data to the GPU
   for non-factored matrices, and on final (non-flush) assembly installs the
   CUSPARSE multiply implementations.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1644 
1645 /* --------------------------------------------------------------------------------*/
1646 #undef __FUNCT__
1647 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1648 /*@
1649    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1651    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1652    assembly performance the user should preallocate the matrix storage by setting
1653    the parameter nz (or the array nnz).  By setting these parameters accurately,
1654    performance during matrix assembly can be increased by more than a factor of 50.
1655 
1656    Collective on MPI_Comm
1657 
1658    Input Parameters:
1659 +  comm - MPI communicator, set to PETSC_COMM_SELF
1660 .  m - number of rows
1661 .  n - number of columns
1662 .  nz - number of nonzeros per row (same for all rows)
1663 -  nnz - array containing the number of nonzeros in the various rows
1664          (possibly different for each row) or NULL
1665 
1666    Output Parameter:
1667 .  A - the matrix
1668 
1669    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1671    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1672 
1673    Notes:
1674    If nnz is given then nz is ignored
1675 
1676    The AIJ format (also called the Yale sparse matrix format or
1677    compressed row storage), is fully compatible with standard Fortran 77
1678    storage.  That is, the stored row and column indices can begin at
1679    either one (as in Fortran) or zero.  See the users' manual for details.
1680 
1681    Specify the preallocated storage with either nz or nnz (not both).
1682    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1683    allocation.  For large problems you MUST preallocate memory or you
1684    will get TERRIBLE performance, see the users' manual chapter on matrices.
1685 
1686    By default, this format uses inodes (identical nodes) when possible, to
1687    improve numerical efficiency of matrix-vector products and solves. We
1688    search for consecutive rows with the same nonzero structure, thereby
1689    reusing matrix information to achieve increased efficiency.
1690 
1691    Level: intermediate
1692 
1693 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1694 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the cast drops const to match the SeqAIJ preallocation routine's signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1706 
1707 #undef __FUNCT__
1708 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Destroys the GPU-side data in A->spptr (the multiply structures for a
   regular matrix, or the triangular-factor structures for a factored one),
   then delegates to the base SeqAIJ destroy.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) {
    /* nothing to free if the GPU copy was never allocated */
    ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1724 
1725 #undef __FUNCT__
1726 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor for MATSEQAIJCUSPARSE: builds on MatCreate_SeqAIJ, then
   attaches either the GPU multiply structure (non-factored matrices) or the
   triangular-factor structure (factored matrices) to B->spptr, creates a
   cusparse handle, and installs the CUSPARSE operation table.

   Fixes over the previous revision: the redundant dead stores (handle and
   stream were each assigned 0 and then immediately reassigned) are removed,
   and a typed local pointer replaces the repeated casts of B->spptr.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;
    cusparsestruct->mat          = 0;
    cusparsestruct->matTranspose = 0;
    cusparsestruct->workVector   = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->stream       = 0;   /* default stream (synchronizing) */
    cusparsestruct->handle       = 0;
    B->spptr = cusparsestruct;          /* attach before handle creation so cleanup can find it */
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusparsestruct->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *trifactors = new Mat_SeqAIJCUSPARSETriFactors;
    trifactors->loTriFactorPtr          = 0;
    trifactors->upTriFactorPtr          = 0;
    trifactors->loTriFactorPtrTranspose = 0;
    trifactors->upTriFactorPtrTranspose = 0;
    trifactors->rpermIndices            = 0;
    trifactors->cpermIndices            = 0;
    trifactors->workVector              = 0;
    trifactors->handle                  = 0;
    trifactors->nnz                     = 0;
    B->spptr = trifactors;              /* attach before handle creation so cleanup can find it */
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    trifactors->handle                  = handle;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1780 
1781 /*M
1782    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1783 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1785    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1786    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1787 
1788    Options Database Keys:
1789 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1790 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1792 
1793   Level: beginner
1794 
1795 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1796 M*/
1797 
1798 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1799 
1800 
1801 #undef __FUNCT__
1802 #define __FUNCT__ "MatSolverPackageRegister_CUSPARSE"
/*
   Registers the CUSPARSE solver package as the factorization provider for
   SEQAIJCUSPARSE matrices, for each supported factor type (LU, Cholesky,
   ILU, ICC).
*/
PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i = 0; i < sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1814 
1815 
1816 #undef __FUNCT__
1817 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
/*
   Frees a Mat_SeqAIJCUSPARSE structure: both multiply structures (plain and
   transpose), the work vector, and the cusparse handle; zeroes the caller's
   pointer. Safe to call with *cusparsestruct == NULL.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
    delete (*cusparsestruct)->workVector;
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1836 
1837 #undef __FUNCT__
1838 #define __FUNCT__ "CsrMatrix_Destroy"
/*
   Frees a CsrMatrix (its values, column indices, and row offsets device
   arrays, then the struct itself) and zeroes the caller's pointer. Safe to
   call with *mat == NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1851 
1852 #undef __FUNCT__
1853 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
/*
   Frees one triangular-factor structure: its matrix descriptor, solve
   analysis info, and CSR storage; zeroes the caller's pointer. Safe to call
   with *trifactor == NULL.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1869 
1870 #undef __FUNCT__
1871 #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/*
   Frees a multiply structure: the stored matrix (HYB handle or CSR arrays,
   depending on format), the matrix descriptor, the compressed row index
   array, and the device-resident alpha/beta scalars; zeroes the caller's
   pointer. Safe to call with *matstruct == NULL.

   Fix over the previous revision: the return value of CsrMatrix_Destroy()
   is now checked with CHKERRQ, matching the error-checking convention used
   elsewhere in this file (e.g. Mat_SeqAIJCUSPARSETriFactorStruct_Destroy).
*/
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta were cudaMalloc'ed in MatSeqAIJCUSPARSECopyToGPU */
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1898 
1899 #undef __FUNCT__
1900 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/*
   Frees a full set of triangular factors: the four factor structures
   (lower/upper, plain and transpose), the permutation index arrays, the
   work vector, and the cusparse handle; zeroes the caller's pointer. Safe
   to call with *trifactors == NULL.

   Fixes over the previous revision: the return values of the four
   Mat_SeqAIJCUSPARSETriFactorStruct_Destroy() calls are now checked with
   CHKERRQ (they were silently ignored), and the assignment inside the if
   condition is separated out for clarity.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1923 
1924