xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4fc747eaadbeca11629f314a99edccbc2ed7b3d3)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* Storage-format names, followed by the enum type name, the option prefix, and a 0
   terminator -- the layout consumed by PetscOptionsEnum (see MatSetFromOptions_SeqAIJCUSPARSE) */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
/* Forward declarations of the factorization, solve, mult, options, and destroy
   routines implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/* Record the CUDA stream on the matrix' GPU data structure and attach it to the
   cuSPARSE handle so subsequent cuSPARSE calls for this matrix run on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cerr;

  PetscFunctionBegin;
  cusp->stream = stream;
  cerr = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
53 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/* Replace the matrix' cuSPARSE handle with a caller-supplied one.  Any handle the
   matrix currently holds is destroyed first; the new handle is put into device
   pointer mode so scalar arguments are read from device memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cerr;

  PetscFunctionBegin;
  if (cusp->handle) {
    cerr = cusparseDestroy(cusp->handle);CHKERRCUDA(cerr);
  }
  cusp->handle = handle;
  cerr = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
69 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/* Forget (without destroying) the matrix' reference to its cuSPARSE handle;
   ownership of the handle itself remains with whoever installed it. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
80 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query routine composed on factor matrices (as "MatFactorGetSolverPackage_C"):
   reports that this factorization comes from the CUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
89 
90 /*MC
91   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
92   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
94   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96   algorithms are not recommended. This class does NOT support direct solver operations.
97 
98   Level: beginner
99 
100 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101 M*/
102 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   MATSOLVERCUSPARSE factory routine: creates an empty square SEQAIJCUSPARSE
   matrix *B of the same local row size as A and installs the symbolic
   factorization entry points for the requested factor type.  LU/ILU/ILUDT
   and Cholesky/ICC are supported; any other type raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;   /* factor matrices are square n x n */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  /* factortype is set before MatSetType; NOTE(review): presumably so type-specific
     setup can see it -- confirm against the MATSEQAIJCUSPARSE constructor */
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* skip the usual AIJ array allocation; the symbolic/numeric stages fill the factor */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
129 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation of MatCUSPARSESetFormat for SEQAIJCUSPARSE:
   records the requested storage format.  MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   both update the single format field; other operations are unsupported. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* ELL and HYB formats only exist with sufficiently new CUDA */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
153 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:
   None

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; PetscTryMethod is a no-op
     when the matrix type does not provide "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
182 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Process runtime options for SEQAIJCUSPARSE matrices:
   -mat_cusparse_mult_storage_format  - storage format used for SpMV only
                                        (unfactored matrices only)
   -mat_cusparse_storage_format       - storage format used for SpMV and TriSolve

   Fix: the original paired PetscOptionsHead with a stray nested
   PetscObjectOptionsBegin (whose return code was never checked) and
   PetscOptionsEnd, so the required matching PetscOptionsTail was missing.
   PetscOptionsHead/PetscOptionsTail must bracket the options of this routine
   inside the caller-provided PetscOptionsObject context.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option applies to unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
211 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: delegates to the CPU SeqAIJ symbolic routine,
   then redirects the numeric stage to the CUSPARSE-aware implementation so
   the resulting factors are analyzed and copied to the GPU.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* override the numeric factorization with the GPU-capable variant */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
223 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: delegates to the CPU SeqAIJ symbolic routine,
   then redirects the numeric stage to the CUSPARSE-aware implementation.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* override the numeric factorization with the GPU-capable variant */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
235 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ICC factorization: delegates to the CPU SeqAIJ symbolic routine,
   then redirects the numeric stage to the CUSPARSE-aware implementation.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* override the numeric factorization with the GPU-capable variant */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
247 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic Cholesky factorization: delegates to the CPU SeqAIJ symbolic
   routine, then redirects the numeric stage to the CUSPARSE-aware
   implementation.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* override the numeric factorization with the GPU-capable variant */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
259 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Builds the lower triangular ILU factor L (unit diagonal) on the GPU:
   assembles, in pinned host memory, a CSR copy of the strictly-lower part of
   the host factor with an explicit 1.0 appended on each row's diagonal,
   uploads it into a freshly allocated Mat_SeqAIJCUSPARSETriFactorStruct, and
   runs the cusparse triangular-solve analysis on it.  Work is only done when
   the GPU copy is unallocated or stale (PETSC_CUDA_UNALLOCATED/PETSC_CUDA_CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;   /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AALo;          /* pinned host CSR values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] counts the strictly-lower entries of rows 1..n-1 (row 0 of L is just its unit diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory for fast H2D transfer) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;   /* row 0 holds only the unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;        /* append the explicit unit diagonal last in the row */
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): the loTriFactor read from cusparseTriFactors above is overwritten here;
         any previously existing factor is not freed first -- confirm callers destroy it */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device (thrust) vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust normally throws std::exception subclasses, not char*;
         confirm this catch clause matches what the GPU stack actually throws */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
363 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Builds the upper triangular ILU factor U on the GPU: walks the rows of the
   host factor backwards via a->diag, assembles a CSR copy of U (diagonal
   first in each row) in pinned host memory, uploads it into a new
   Mat_SeqAIJCUSPARSETriFactorStruct, and runs the cusparse triangular-solve
   analysis.  Work is only done when the GPU copy is unallocated or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AAUp;          /* pinned host CSR values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases, so adiag[0]-adiag[n] spans all U entries */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        /* NOTE(review): v[nz] appears to be the stored (inverted) diagonal of the
           factored-AIJ layout, so 1./v[nz] recovers the actual diagonal -- confirm */
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then copy the off-diagonal entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device (thrust) vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
463 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   Builds both ILU triangular factors on the GPU (including the cusparse solve
   analysis), allocates the device work vector used between the two triangular
   solves, and caches the row/column permutations as device index arrays when
   the orderings are not identities.  Marks the GPU copy as up to date.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the intermediate result between the L and U solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUDA_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only store the permutation on the device when it is non-trivial */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
504 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both ICC triangular factors on the GPU from the upper-triangular
   host factor.  A single CSR copy of U (diagonal first in each row) is
   assembled in pinned host memory; the upper factor uses it directly
   (NON_TRANSPOSE, unit diagonal), while the lower factor reuses the same
   sparsity pattern with rescaled values (AALo) and is solved as the
   TRANSPOSE of the upper-stored CSR (fill mode UPPER + transposed solve).
   Work is only done when the GPU copy is unallocated or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AAUp;          /* values of the upper factor */
  PetscScalar                       *AALo;          /* rescaled values used for the lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above) and Mat_SeqSBAIJ here;
     this relies on the i/j/a members lining up between the two structs -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* NOTE(review): v[nz] appears to be the stored diagonal entry of row i;
           both factors store its reciprocal at the diagonal position -- confirm */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries: negated for U, additionally scaled by 1/diag for L */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device (thrust) vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: note fill mode UPPER -- the lower factor is
         realized as the transpose of the upper-stored CSR (see solveOp below) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation: transposed solve of the upper-stored data yields the lower solve */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
651 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   Builds the ICC triangular factors on the GPU, allocates the device work
   vector for the two-stage solve, and caches the (symmetric) permutation as
   both row and column device index arrays when it is not the identity.
   nnz counts both triangles sharing the stored off-diagonal entries plus
   one diagonal: (a->nz-n)*2 + n.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector holding the intermediate result between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric factorization: the same permutation serves rows and columns */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
682 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: run the CPU AIJ kernel, pick the GPU solve
   variants based on the ordering, then stage the factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the factorization itself is computed on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the natural ordering admits a faster solve with no permutation step */
  ierr = ISIdentity(fact->row,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
709 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: run the CPU AIJ kernel, pick the GPU solve
   variants based on the ordering, then stage the factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the factorization itself is computed on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* an identity ordering lets us skip the permutation step in the solves */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  B->ops->solve          = identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                    : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                    : MatSolveTranspose_SeqAIJCUSPARSE;

  /* copy the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
736 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Build explicit transposes of the lower and upper triangular factors on the
   GPU (via csr2csc) and run the cusparse solve analysis on each, storing the
   results in loTriFactorPtrTranspose/upTriFactorPtrTranspose.  Called lazily
   from the transpose solves the first time they are needed.

   Because the transpose is materialized explicitly, the solves use
   CUSPARSE_OPERATION_NON_TRANSPOSE with the fill mode flipped relative to the
   original factor.

   Note: this always allocates new transpose structures; callers are expected
   to invoke it only when loTriFactorPtrTranspose/upTriFactorPtrTranspose are
   unset, otherwise the previous structures would leak.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below; the previous dead-store init from the factors struct was removed */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill mode
     is flipped because we store the transpose explicitly */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* no transpose operation needed at solve time: the data is already transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* store the result; loTriFactorT is only a local */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor; again with the
     fill mode flipped */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* no transpose operation needed at solve time: the data is already transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* store the result; upTriFactorT is only a local */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
869 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Build an explicit transpose of the matrix on the GPU for MatMultTranspose.

   For CSR storage the transpose is obtained directly with csr2csc.  For
   ELL/HYB storage (requires CUDA >= 5.0) the matrix is converted
   HYB -> CSR -> CSC -> HYB through device temporaries that are freed here.
   The result is stored in cusparsestruct->matTranspose.

   Note: this always allocates a new Mat_SeqAIJCUSPARSEMultStruct; callers are
   expected to invoke it only when matTranspose is unset, otherwise the
   previous structure would leak.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* allocated below; the previous dead-store init from cusparsestruct->matTranspose was removed */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transposed matrix information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars for the cusparse mat-vec calls */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC, of the matrix */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty: the transpose is not row-compressed) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* store the result; matstructT is only a local */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
998 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solve A^T x = b on the GPU when the factorization uses a non-trivial ordering.

   The explicitly transposed triangular factors are built lazily on first use.
   The right-hand side is first gathered through rpermIndices into x, then the
   transposed upper and lower factors are solved in turn, and finally the
   result is gathered through cpermIndices (via the work vector, since the
   gather cannot be done in place).

   NOTE(review): assumes rpermIndices/cpermIndices were populated by the
   analysis routine (non-identity ordering); the NaturalOrdering variant is
   selected otherwise.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, the first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Solve with the transposed upper factor first (for A = LU, A^T = U^T L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the transposed lower factor */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and wait for the asynchronous GPU work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1068 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solve A^T x = b on the GPU for a natural-ordering factorization: no
   permutations are needed, so b feeds the first triangular solve directly.
   The explicitly transposed factors are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, the first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* Solve with the transposed upper factor first (for A = LU, A^T = U^T L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the transposed lower factor */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the vector arrays and wait for the asynchronous GPU work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1119 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solve A x = b on the GPU when the factorization uses a non-trivial ordering:
   gather b through rpermIndices into x, forward solve with L, back solve with
   U, then gather the result through cpermIndices (via the work vector, since
   the gather cannot be done in place).  Both vectors must be of type VECSEQCUDA.

   NOTE(review): assumes rpermIndices/cpermIndices were populated by the
   analysis routine (non-identity ordering); the NaturalOrdering variant is
   selected otherwise.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  VecType                               t;
  PetscBool                             flg;

  PetscFunctionBegin;
  /* both vectors must live on the GPU */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L (forward solve into the work vector) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U (back solve into x) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and wait for the asynchronous GPU work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1188 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solve A x = b on the GPU when the factorization uses the natural ordering,
   so no permutation of b or x is required: forward solve with L into the
   work vector, then back solve with U into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *bdevice;
  PetscScalar                       *xdevice;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device views of the vectors */
  ierr = VecCUDAGetArrayWrite(xx,&xdevice);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdevice);CHKERRQ(ierr);

  /* forward solve: L y = b, y in the work vector */
  stat = cusparse_solve(factors->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        bdevice, work->data().get());CHKERRCUDA(stat);

  /* back solve: U x = y */
  stat = cusparse_solve(factors->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xdevice);CHKERRCUDA(stat);

  /* release the device views and wait for the asynchronous work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&bdevice);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xdevice);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1231 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Mirrors the host (CPU) AIJ storage of A onto the GPU in the storage format
   selected in Mat_SeqAIJCUSPARSE (CSR, ELL, or HYB).  The copy is performed
   only when the GPU copy is absent or stale (PETSC_CUDA_UNALLOCATED or
   PETSC_CUDA_CPU); afterwards the matrix is marked PETSC_CUDA_BOTH.
   Rows with no nonzeros are squeezed out: the GPU always holds a
   compressed-row representation, with cprowIndices mapping compressed rows
   back to global row indices.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* discard any existing GPU copy before rebuilding it */
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
    try {
      /* count the rows that contain at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* host already stores a compressed-row index; reuse it directly */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU */
        int k=0;
        ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
        ii[0]=0;
        /* keep only rows with nonzeros; ridx records their global row numbers */
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the triangular factor information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

      /* stage the file-scope scalars ALPHA and BETA on the device so that the
         SpMV routines can run with CUSPARSE_POINTER_MODE_DEVICE */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
/* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

/* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* ELL/HYB requires CUDA 4.2+: build a temporary device CSR matrix,
           convert it to a cusparseHybMat_t, then free the temporary */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
        /* ELL maps to the MAX partition (pure ELLPACK); HYB lets CUSPARSE choose */
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUDA(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the CSR staging matrix is no longer needed once converted */
        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated above only when the host was not compressed */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* scratch vector of length m (compressed rows) used by MatMultAdd
         NOTE(review): a pre-existing workVector is not deleted before this
         allocation — possible leak on repeated CPU->GPU copies; verify */
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);

    A->valid_GPU_matrix = PETSC_CUDA_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1365 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates CUDA-resident (VECSEQCUDA) vectors compatible with mat:
   *right matches the column layout (suitable for y = A*x input) and
   *left matches the row layout (suitable for the product).  Either
   output pointer may be NULL to skip that vector.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    /* right vector: sized and blocked by the column map */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* left vector: sized and blocked by the row map */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1391 
/* Functor for thrust::for_each over a zip iterator: accumulates the first
   tuple element into the second, i.e. get<1>(t) += get<0>(t). */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1401 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Computes yy = A*xx on the GPU with the CUSPARSE SpMV routines, dispatching
   on the stored format (CSR vs ELL/HYB).  The scalar multipliers come from
   the device-resident matstruct->alpha/beta pointers set up at copy time.
   Assumes the GPU copy is current (done by MatAssemblyEnd_SeqAIJCUSPARSE).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB share the cusparseHybMat_t storage and SpMV entry point */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when a user stream is attached, skip the blocking synchronization */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* 2*nnz flops minus one per nonempty row (first product needs no add) */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1443 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   Computes yy = A^T*xx.  Rather than calling CUSPARSE with the TRANSPOSE
   operation flag, an explicit transpose of the matrix (matTranspose) is
   built lazily on first use and then multiplied with NON_TRANSPOSE, which
   lets the same SpMV path serve both products.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the transpose representation on demand */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when a user stream is attached, skip the blocking synchronization */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1491 
1492 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A*xx.  Because the GPU matrix stores only nonempty
   rows (compressed-row form), the product A*xx is first written into the
   compressed-length workVector and then scattered with += into zz using
   the cprowIndices permutation.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* start from zz = yy; the product is accumulated onto it below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* here we need to be careful to set the number of rows in the multiply to the
       number of compressed rows in the matrix ... which is equivalent to the
       size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the SpMV entirely when there are no nonempty rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray, matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1553 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A^T*xx, using the lazily-built explicit transpose
   (matTranspose) so the SpMV runs with NON_TRANSPOSE.  As in
   MatMultAdd_SeqAIJCUSPARSE, the compressed-row product goes through
   workVector and is scattered into zz via cprowIndices with +=.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the transpose representation on demand */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy; the product is accumulated onto it below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the SpMV entirely when there are no nonempty rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1620 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Completes assembly on the host, refreshes the GPU copy for unfactored
   matrices, and (on final assembly only) installs the CUSPARSE-backed
   multiplication kernels.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their data in triangular-factor structures; only
     unfactored matrices are mirrored to the GPU here */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: route multiplications through the GPU implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1639 
1640 /* --------------------------------------------------------------------------------*/
1641 #undef __FUNCT__
1642 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1643 /*@
1644    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1645    (the default parallel PETSc format). This matrix will ultimately pushed down
1646    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1647    assembly performance the user should preallocate the matrix storage by setting
1648    the parameter nz (or the array nnz).  By setting these parameters accurately,
1649    performance during matrix assembly can be increased by more than a factor of 50.
1650 
1651    Collective on MPI_Comm
1652 
1653    Input Parameters:
1654 +  comm - MPI communicator, set to PETSC_COMM_SELF
1655 .  m - number of rows
1656 .  n - number of columns
1657 .  nz - number of nonzeros per row (same for all rows)
1658 -  nnz - array containing the number of nonzeros in the various rows
1659          (possibly different for each row) or NULL
1660 
1661    Output Parameter:
1662 .  A - the matrix
1663 
1664    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1665    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1666    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1667 
1668    Notes:
1669    If nnz is given then nz is ignored
1670 
1671    The AIJ format (also called the Yale sparse matrix format or
1672    compressed row storage), is fully compatible with standard Fortran 77
1673    storage.  That is, the stored row and column indices can begin at
1674    either one (as in Fortran) or zero.  See the users' manual for details.
1675 
1676    Specify the preallocated storage with either nz or nnz (not both).
1677    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1678    allocation.  For large problems you MUST preallocate memory or you
1679    will get TERRIBLE performance, see the users' manual chapter on matrices.
1680 
1681    By default, this format uses inodes (identical nodes) when possible, to
1682    improve numerical efficiency of matrix-vector products and solves. We
1683    search for consecutive rows with the same nonzero structure, thereby
1684    reusing matrix information to achieve increased efficiency.
1685 
1686    Level: intermediate
1687 
1688 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1689 @*/
1690 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1691 {
1692   PetscErrorCode ierr;
1693 
1694   PetscFunctionBegin;
1695   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1696   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1697   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1698   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1699   PetscFunctionReturn(0);
1700 }
1701 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Releases the GPU-side data attached to A->spptr (multiplication structures
   for unfactored matrices, triangular-factor structures for factored ones),
   then defers to the standard SeqAIJ destructor for the host data.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrix: spptr holds the triangular factors */
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) {
    /* unfactored matrix with a GPU copy: release the multiplication structures */
    ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1719 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor for MATSEQAIJCUSPARSE.  Builds the underlying SeqAIJ
   matrix, attaches the GPU data structure appropriate to the factor type
   (a Mat_SeqAIJCUSPARSE for ordinary matrices, a
   Mat_SeqAIJCUSPARSETriFactors for factored ones) with its own CUSPARSE
   handle, and installs the CUSPARSE-backed operations.

   Fix over previous revision: removed redundant dead stores (handle and
   stream fields were each assigned twice in the unfactored branch).
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  /* route matrix operations through the CUSPARSE implementations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet */
  B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1775 
1776 /*M
1777    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1778 
1779    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1780    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1781    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1782 
1783    Options Database Keys:
1784 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1785 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1786 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1787 
1788   Level: beginner
1789 
1790 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1791 M*/
1792 
1793 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1794 
1795 
#undef __FUNCT__
#define __FUNCT__ "MatSolverPackageRegister_CUSPARSE"
/*
   Registers the CUSPARSE solver package for SeqAIJCUSPARSE matrices with
   each supported factorization type (LU, Cholesky, ILU, ICC).
*/
PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscInt            i;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  for (i=0; i<4; i++) {
    ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1809 
1810 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
/*
   Frees all GPU data held by a Mat_SeqAIJCUSPARSE structure: the
   multiplication structures for the matrix and its transpose, the work
   vector, and the CUSPARSE handle, then the structure itself.  Safe to
   call with *cusparsestruct == NULL; the pointer is zeroed on return.

   Fixes over previous revision: the PetscErrorCode returned by
   Mat_SeqAIJCUSPARSEMultStruct_Destroy is now checked, and the
   assignment-inside-if on the handle is replaced with a plain test.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1831 
#undef __FUNCT__
#define __FUNCT__ "CsrMatrix_Destroy"
/*
   Frees the three device arrays of a CsrMatrix (values, column indices,
   row offsets) and the struct itself; no-op when *mat is NULL.  The
   caller's pointer is zeroed.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1846 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
/*
   Tears down one triangular-factor structure: the CUSPARSE matrix
   descriptor, the solve-analysis info, and the CSR storage, then the
   struct itself.  No-op when *trifactor is NULL; the pointer is zeroed.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1864 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/*
   Frees a multiplication structure: the stored matrix (a cusparseHybMat_t
   for ELL/HYB formats, a CsrMatrix otherwise), the matrix descriptor, the
   compressed-row index array, and the device-resident alpha/beta scalars.
   No-op when *matstruct is NULL; the pointer is zeroed on return.

   Fix over previous revision: the PetscErrorCode returned by
   CsrMatrix_Destroy is now checked with CHKERRQ, matching the
   error-handling convention used throughout this file.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha) { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta)  { err = cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1893 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/*
   Frees the full set of triangular factors (lower/upper and their
   transposes), the row/column permutation index arrays, the work vector,
   and the CUSPARSE handle, then the container itself.  No-op when
   *trifactors is NULL; the pointer is zeroed on return.

   Fixes over previous revision: the PetscErrorCode returned by each
   Mat_SeqAIJCUSPARSETriFactorStruct_Destroy call is now checked, and the
   assignment-inside-if on the handle is replaced with a plain test.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1918 
1919