xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 34d6c7a575bdb8fc9e2ffe999767d738671d28b3)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
41 #undef __FUNCT__
42 #define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - Records a CUDA stream on the matrix' GPU data
   structure and directs the matrix' cusparse handle to issue its work on
   that stream.

   Input Parameters:
+  A      - the matrix; A->spptr must already point at a Mat_SeqAIJCUSPARSE
            (not checked here -- presumably guaranteed by the callers, verify)
-  stream - the CUDA stream to use for subsequent cusparse calls

   Level: developer
*/
43 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
44 {
45   cusparseStatus_t   stat;
46   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
47 
48   PetscFunctionBegin;
49   cusparsestruct->stream = stream;
  /* note: this file checks cusparseStatus_t values with CHKERRCUDA */
50   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
51   PetscFunctionReturn(0);
52 }
53 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - Installs a user-provided cusparse handle on the
   matrix and switches it to device pointer mode.

   Any previously installed, *different* handle is destroyed first.  The
   comparison against the incoming handle is essential: the original code
   destroyed the stored handle unconditionally, so passing the handle that
   was already set destroyed it and then re-stored the now-invalid handle.

   Input Parameters:
+  A      - the matrix; A->spptr must already point at a Mat_SeqAIJCUSPARSE
-  handle - the cusparse handle the matrix should use from now on

   Level: developer
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the old handle before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
69 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - Forgets the cusparse handle stored on the matrix
   without destroying it (the handle is presumably owned by whoever installed
   it via MatCUSPARSESetHandle() -- verify).

   Input Parameter:
.  A - the matrix whose stored handle should be dropped

   Level: developer
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *gpustruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* only a reference is dropped here; no cusparseDestroy() is issued */
  if (gpustruct->handle) {
    gpustruct->handle = 0;
  }
  PetscFunctionReturn(0);
}
80 
81 #undef __FUNCT__
82 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query function composed onto factor matrices (under the key
   "MatFactorGetSolverPackage_C"): reports that the factorization is provided
   by the CUSPARSE solver package.

   Output Parameter:
.  type - set to MATSOLVERCUSPARSE
*/
83 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
84 {
85   PetscFunctionBegin;
86   *type = MATSOLVERCUSPARSE;
87   PetscFunctionReturn(0);
88 }
89 
90 /*MC
91   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
92   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
93   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
94   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
95   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
96   algorithms are not recommended. This class does NOT support direct solver operations.
97 
98   Level: beginner
99 
100 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101 M*/
102 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   Factory routine for MATSOLVERCUSPARSE factors of SEQAIJCUSPARSE matrices:
   creates the (square, n x n) factor matrix B, installs the CUSPARSE symbolic
   factorization routines appropriate for the requested factor type, and
   composes the solver-package query function on it.

   Input Parameters:
+  A     - the matrix to be factored (only its communicator and local size are used here)
-  ftype - MAT_FACTOR_LU/ILU/ILUDT or MAT_FACTOR_CHOLESKY/ICC

   Output Parameter:
.  B - the new factor matrix
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       m = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,m,m,m,m);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* the two families of factorizations are mutually exclusive */
  if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* block sizes are propagated only on the LU/ILU path, as before */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
129 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE:
   records the requested GPU storage format.  MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both map onto the single stored format; anything else is
   rejected.  Before CUDA 4.2 only CSR exists, so ELL/HYB requests error out.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    /* both operations share the one stored format on the sequential type */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
153 
154 /*@
155    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
156    operation. Only the MatMult operation can use different GPU storage formats
157    for MPIAIJCUSPARSE matrices.
158    Not Collective
159 
160    Input Parameters:
161 +  A - Matrix of type SEQAIJCUSPARSE
162 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
163 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
164 
165    Output Parameter:
166    None directly; the chosen format is recorded on A and used by subsequent operations
167    Level: intermediate
168 
169 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
170 @*/
171 #undef __FUNCT__
172 #define __FUNCT__ "MatCUSPARSESetFormat"
173 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
174 {
175   PetscErrorCode ierr;
176 
177   PetscFunctionBegin;
178   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: dispatches to the composed "MatCUSPARSESetFormat_C"
     implementation if the matrix type provides one, and is a silent no-op
     otherwise (e.g. for non-CUSPARSE matrix types) */
179   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
180   PetscFunctionReturn(0);
181 }
182 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Options handler for SEQAIJCUSPARSE matrices.  Offers
   -mat_cusparse_mult_storage_format (SpMV only, skipped for factor matrices)
   and -mat_cusparse_storage_format (SpMV and triangular solve).

   This routine is called from within an options loop identified by
   PetscOptionsObject, so it must bracket its options with
   PetscOptionsHead()/PetscOptionsTail().  The original code instead opened a
   *nested* PetscObjectOptionsBegin()/PetscOptionsEnd() pair (leaving the
   Head unbalanced) and never checked the error code returned by the Begin;
   both problems are fixed here.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
211 
212 #undef __FUNCT__
213 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: delegate entirely to the CPU SeqAIJ implementation, then
   redirect the numeric stage to the CUSPARSE version.  The assignment must
   come AFTER the delegated call -- the SeqAIJ symbolic routine presumably
   installs its own lufactornumeric pointer, which we override here (confirm
   against MatILUFactorSymbolic_SeqAIJ). */
214 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
215 {
216   PetscErrorCode ierr;
217 
218   PetscFunctionBegin;
219   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
220   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
221   PetscFunctionReturn(0);
222 }
223 
224 #undef __FUNCT__
225 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: delegate to the CPU SeqAIJ implementation, then redirect the
   numeric stage to the CUSPARSE version.  As with the ILU wrapper above, the
   op-pointer assignment must follow the delegated call so it overrides the
   pointer the SeqAIJ routine installs. */
226 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
227 {
228   PetscErrorCode ierr;
229 
230   PetscFunctionBegin;
231   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
232   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
233   PetscFunctionReturn(0);
234 }
235 
236 #undef __FUNCT__
237 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: delegate to the CPU SeqAIJ implementation, then redirect the
   numeric stage to the CUSPARSE Cholesky version.  Assignment must follow the
   delegated call (it overrides the pointer installed there). */
238 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
239 {
240   PetscErrorCode ierr;
241 
242   PetscFunctionBegin;
243   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
244   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: delegate to the CPU SeqAIJ implementation, then redirect
   the numeric stage to the CUSPARSE version.  Assignment must follow the
   delegated call (it overrides the pointer installed there). */
250 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
256   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
257   PetscFunctionReturn(0);
258 }
259 
260 #undef __FUNCT__
261 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Expands the lower triangular
   factor L (whose unit diagonal is implicit in the SeqAIJ factor storage --
   layout per the SeqAIJ LU factorization, TODO confirm) into an explicit CSR
   matrix in pinned host memory, uploads it to the GPU through thrust device
   arrays, and runs the cusparse triangular-solve analysis phase on it.

   Runs only when the GPU copy is missing or stale (UNALLOCATED / CPU-only).
*/
262 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
263 {
264   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
265   PetscInt                          n = A->rmap->n;
266   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
267   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
268   cusparseStatus_t                  stat;
269   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
270   const MatScalar                   *aa = a->a,*v;
271   PetscInt                          *AiLo, *AjLo;
272   PetscScalar                       *AALo;
273   PetscInt                          i,nz, nzLower, offset, rowOffset;
274   PetscErrorCode                    ierr;
275 
276   PetscFunctionBegin;
277   if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
278     try {
279       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1
         (row 0 holds only its implicit unit diagonal) */
280       nzLower=n+ai[n]-ai[1];
281 
282       /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers for the expanded CSR arrays */
283       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
284       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
285       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
286 
287       /* Fill the lower triangular matrix */
      /* row 0 is just the explicit unit diagonal entry (0,0)=1 */
288       AiLo[0]  = (PetscInt) 0;
289       AiLo[n]  = nzLower;
290       AjLo[0]  = (PetscInt) 0;
291       AALo[0]  = (MatScalar) 1.0;
292       v        = aa;
293       vi       = aj;
294       offset   = 1;
295       rowOffset= 1;
      /* each subsequent row: copy its strictly-lower entries, then append the
         explicit unit diagonal entry (i,i)=1 */
296       for (i=1; i<n; i++) {
297         nz = ai[i+1] - ai[i];
298         /* additional 1 for the term on the diagonal */
299         AiLo[i]    = rowOffset;
300         rowOffset += nz+1;
301 
302         ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
303         ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
304 
305         offset      += nz;
306         AjLo[offset] = (PetscInt) i;
307         AALo[offset] = (MatScalar) 1.0;
308         offset      += 1;
309 
310         v  += nz;
311         vi += nz;
312       }
313 
314       /* allocate space for the triangular factor information */
315       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
316 
317       /* Create the matrix description */
      /* triangular, lower fill, UNIT diagonal: cusparse will ignore the
         stored 1.0 diagonal values during the solve */
318       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
319       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
320       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
321       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
322       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
323 
324       /* Create the solve analysis information */
325       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
326 
327       /* set the operation */
328       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
329 
330       /* set the matrix */
      /* the thrust assign() calls below perform the host-to-device copies */
331       loTriFactor->csrMat = new CsrMatrix;
332       loTriFactor->csrMat->num_rows = n;
333       loTriFactor->csrMat->num_cols = n;
334       loTriFactor->csrMat->num_entries = nzLower;
335 
336       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
337       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
338 
339       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
340       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
341 
342       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
343       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
344 
345       /* perform the solve analysis */
346       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
347                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
348                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
349                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
350 
351       /* assign the pointer. Is this really necessary? */
352       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
353 
      /* pinned staging buffers are no longer needed once copied to the GPU */
354       ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
355       ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
356       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
357     } catch(char *ex) {
358       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
359     }
360   }
361   PetscFunctionReturn(0);
362 }
363 
364 #undef __FUNCT__
365 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Expands the upper triangular
   factor U of the SeqAIJ factorization (addressed through a->diag, which is
   traversed backwards row n-1 .. 0) into an explicit CSR matrix in pinned
   host memory, uploads it to the GPU, and runs the cusparse triangular-solve
   analysis on it.

   Runs only when the GPU copy is missing or stale (UNALLOCATED / CPU-only).
*/
366 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
367 {
368   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
369   PetscInt                          n = A->rmap->n;
370   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
371   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
372   cusparseStatus_t                  stat;
373   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
374   const MatScalar                   *aa = a->a,*v;
375   PetscInt                          *AiUp, *AjUp;
376   PetscScalar                       *AAUp;
377   PetscInt                          i,nz, nzUpper, offset;
378   PetscErrorCode                    ierr;
379 
380   PetscFunctionBegin;
381   if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
382     try {
383       /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with i in this storage scheme, hence adiag[0]-adiag[n] */
384       nzUpper = adiag[0]-adiag[n];
385 
386       /* Allocate Space for the upper triangular matrix */
387       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
388       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
389       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
390 
391       /* Fill the upper triangular matrix */
392       AiUp[0]=(PetscInt) 0;
393       AiUp[n]=nzUpper;
394       offset = nzUpper;
      /* rows are walked backwards, filling the CSR arrays from the end so
         that row i's entries land in front of row i+1's */
395       for (i=n-1; i>=0; i--) {
396         v  = aa + adiag[i+1] + 1;
397         vi = aj + adiag[i+1] + 1;
398 
399         /* number of elements NOT on the diagonal */
400         nz = adiag[i] - adiag[i+1]-1;
401 
402         /* decrement the offset */
403         offset -= (nz+1);
404 
405         /* first, set the diagonal elements */
        /* the SeqAIJ factor presumably stores the reciprocal of U's diagonal
           (so the CPU solve can multiply); 1./v[nz] recovers the actual
           diagonal value, which cusparse's NON_UNIT solve divides by --
           verify against the SeqAIJ numeric factorization */
406         AjUp[offset] = (PetscInt) i;
407         AAUp[offset] = (MatScalar)1./v[nz];
408         AiUp[i]      = AiUp[i+1] - (nz+1);
409 
410         ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
411         ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
412       }
413 
414       /* allocate space for the triangular factor information */
415       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
416 
417       /* Create the matrix description */
      /* triangular, upper fill, NON_UNIT diagonal: the stored diagonal values
         participate in the solve */
418       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
419       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
420       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
421       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
422       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
423 
424       /* Create the solve analysis information */
425       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
426 
427       /* set the operation */
428       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
429 
430       /* set the matrix */
      /* the thrust assign() calls below perform the host-to-device copies */
431       upTriFactor->csrMat = new CsrMatrix;
432       upTriFactor->csrMat->num_rows = n;
433       upTriFactor->csrMat->num_cols = n;
434       upTriFactor->csrMat->num_entries = nzUpper;
435 
436       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
437       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
438 
439       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
440       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
441 
442       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
443       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
444 
445       /* perform the solve analysis */
446       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
447                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
448                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
449                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
450 
451       /* assign the pointer. Is this really necessary? */
452       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
453 
      /* pinned staging buffers are no longer needed once copied to the GPU */
454       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
455       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
456       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
457     } catch(char *ex) {
458       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
459     }
460   }
461   PetscFunctionReturn(0);
462 }
463 
464 #undef __FUNCT__
465 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds both LU/ILU triangular
   factors on the GPU, allocates the length-n intermediate work vector used
   between the two triangular solves, records the factor nnz count, and
   uploads the row/column permutations -- but only when the orderings are not
   the identity (the identity case is handled by the NaturalOrdering solve
   variants, which need no permutation arrays).
*/
466 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
467 {
468   PetscErrorCode               ierr;
469   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
470   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
471   IS                           isrow = a->row,iscol = a->icol;
472   PetscBool                    row_identity,col_identity;
473   const PetscInt               *r,*c;
474   PetscInt                     n = A->rmap->n;
475 
476   PetscFunctionBegin;
477   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
478   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
479 
  /* device scratch vector holding the result of the lower solve before the
     upper solve consumes it */
480   cusparseTriFactors->workVector = new THRUSTARRAY;
481   cusparseTriFactors->workVector->resize(n);
482   cusparseTriFactors->nnz=a->nz;
483 
484   A->valid_GPU_matrix = PETSC_CUDA_BOTH;
485   /*lower triangular indices */
486   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
487   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
488   if (!row_identity) {
    /* assign() copies the host index array to the device */
489     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
490     cusparseTriFactors->rpermIndices->assign(r, r+n);
491   }
492   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
493 
494   /*upper triangular indices */
495   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
496   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
497   if (!col_identity) {
498     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
499     cusparseTriFactors->cpermIndices->assign(c, c+n);
500   }
501   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
502   PetscFunctionReturn(0);
503 }
504 
505 #undef __FUNCT__
506 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds both triangular factors for
   ICC/Cholesky from the upper-triangular factor computed on the CPU.  Only
   the upper factor U is expanded into CSR form; the "lower" factor reuses
   that same upper-triangular sparsity pattern (with rescaled values, AALo)
   and is solved with CUSPARSE_OPERATION_TRANSPOSE, i.e. as L = U^T.

   Runs only when the GPU copy is missing or stale (UNALLOCATED / CPU-only).
*/
507 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
508 {
509   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
510   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
511   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
512   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
513   cusparseStatus_t                  stat;
514   PetscErrorCode                    ierr;
515   PetscInt                          *AiUp, *AjUp;
516   PetscScalar                       *AAUp;
517   PetscScalar                       *AALo;
518   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast to Mat_SeqAIJ above AND to Mat_SeqSBAIJ
     here; only the i/j/a members of b are used, which presumably lie at
     compatible offsets in the two structs -- verify */
519   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
520   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
521   const MatScalar                   *aa = b->a,*v;
522 
523   PetscFunctionBegin;
524   if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
525     try {
526       /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; AALo holds the rescaled values for the
         transposed (lower) solve over the same sparsity pattern */
527       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
528       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
529       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
530       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
531 
532       /* Fill the upper triangular matrix */
533       AiUp[0]=(PetscInt) 0;
534       AiUp[n]=nzUpper;
535       offset = 0;
536       for (i=0; i<n; i++) {
537         /* set the pointers */
538         v  = aa + ai[i];
539         vj = aj + ai[i];
540         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
541 
542         /* first, set the diagonal elements */
        /* v[nz] is the stored diagonal for row i; 1.0/v[nz] is written as the
           diagonal entry of both factors (the upper factor's descriptor marks
           the diagonal UNIT below, so cusparse presumably ignores it for the
           upper solve -- confirm) */
543         AjUp[offset] = (PetscInt) i;
544         AAUp[offset] = (MatScalar)1.0/v[nz];
545         AiUp[i]      = offset;
546         AALo[offset] = (MatScalar)1.0/v[nz];
547 
548         offset+=1;
549         if (nz>0) {
550           ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
551           ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals: negate for the upper factor, and additionally
             scale by the diagonal for the lower (transposed) factor */
552           for (j=offset; j<offset+nz; j++) {
553             AAUp[j] = -AAUp[j];
554             AALo[j] = AAUp[j]/v[nz];
555           }
556           offset+=nz;
557         }
558       }
559 
560       /* allocate space for the triangular factor information */
561       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
562 
563       /* Create the matrix description */
564       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
565       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
566       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
567       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
568       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
569 
570       /* Create the solve analysis information */
571       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
572 
573       /* set the operation */
574       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
575 
576       /* set the matrix */
      /* thrust assign() performs the host-to-device copies */
577       upTriFactor->csrMat = new CsrMatrix;
578       upTriFactor->csrMat->num_rows = A->rmap->n;
579       upTriFactor->csrMat->num_cols = A->cmap->n;
580       upTriFactor->csrMat->num_entries = a->nz;
581 
582       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
583       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
584 
585       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
586       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
587 
588       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
589       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
590 
591       /* perform the solve analysis */
592       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
593                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
594                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
595                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
596 
597       /* assign the pointer. Is this really necessary? */
598       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
599 
600       /* allocate space for the triangular factor information */
601       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
602 
603       /* Create the matrix description */
      /* deliberately FILL_MODE_UPPER, not LOWER: the "lower" factor is the
         same upper-triangular pattern solved with a TRANSPOSE operation
         (set just below), i.e. L = U^T.  Not a bug. */
604       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
605       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
606       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
607       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
608       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
609 
610       /* Create the solve analysis information */
611       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
612 
613       /* set the operation */
614       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
615 
616       /* set the matrix */
      /* same row offsets and column indices as the upper factor, but the
         rescaled AALo values */
617       loTriFactor->csrMat = new CsrMatrix;
618       loTriFactor->csrMat->num_rows = A->rmap->n;
619       loTriFactor->csrMat->num_cols = A->cmap->n;
620       loTriFactor->csrMat->num_entries = a->nz;
621 
622       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
623       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
624 
625       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
626       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
627 
628       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
629       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
630 
631       /* perform the solve analysis */
632       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
633                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
634                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
635                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
636 
637       /* assign the pointer. Is this really necessary? */
638       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
639 
640       A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      /* pinned staging buffers no longer needed once on the GPU */
641       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
642       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
643       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
644       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
645     } catch(char *ex) {
646       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
647     }
648   }
649   PetscFunctionReturn(0);
650 }
651 
652 #undef __FUNCT__
653 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds both ICC/Cholesky
   triangular factors on the GPU (the Build routine also updates
   A->valid_GPU_matrix), allocates the length-n work vector used between the
   two triangular solves, records the factor nnz count, and uploads the
   symmetric permutation when it is not the identity.
*/
654 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
655 {
656   PetscErrorCode               ierr;
657   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
658   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
659   IS                           ip = a->row;
660   const PetscInt               *rip;
661   PetscBool                    perm_identity;
662   PetscInt                     n = A->rmap->n;
663 
664   PetscFunctionBegin;
665   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
666   cusparseTriFactors->workVector = new THRUSTARRAY;
667   cusparseTriFactors->workVector->resize(n);
  /* (a->nz - n)*2 + n: both factors share the strictly-triangular entries;
     the diagonal is counted once */
668   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
669 
670   /*lower triangular indices */
671   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
672   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
673   if (!perm_identity) {
    /* symmetric factorization: the same permutation is used for rows and
       columns, so rperm and cperm get identical copies */
674     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
675     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
676     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
677     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
678   }
679   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
680   PetscFunctionReturn(0);
681 }
682 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization.  The factorization itself runs on the CPU via
   MatLUFactorNumeric_SeqAIJ; afterwards the appropriate MatSolve variants
   are installed (permutation-free kernels when both orderings are the
   identity) and the triangular factors are analyzed and copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the solve kernels: natural ordering avoids the device-side
     permutation of the right-hand side and solution */
  ierr = ISIdentity(aij->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(aij->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* analyze the triangular factors and copy them to the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
709 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric Cholesky/ICC factorization.  The factorization itself runs on the
   CPU via MatCholeskyFactorNumeric_SeqAIJ; afterwards the appropriate
   MatSolve variants are installed (permutation-free kernels when the
   ordering is the identity) and the factors are analyzed and copied to the
   GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the solve kernels: natural ordering avoids the device-side
     permutation of the right-hand side and solution */
  ierr = ISIdentity(aij->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* analyze the triangular factors and copy them to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
736 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSETransposeTriFactor"
/*
   Helper: builds the explicit transpose (the CSC, stored as a CSR matrix) of
   one triangular factor and runs the cusparse solve analysis on it.

   The transposed factor inherits the matrix type, index base, and diagonal
   type of the source factor; the fill mode is flipped, since the transpose
   of a lower-triangular factor is upper-triangular and vice versa.  Because
   the transpose is stored explicitly, its solve operation is
   CUSPARSE_OPERATION_NON_TRANSPOSE.

   The newly allocated factor is returned through factorT; the caller owns it.
*/
static PetscErrorCode MatSeqAIJCUSPARSETransposeTriFactor(cusparseHandle_t handle,Mat_SeqAIJCUSPARSETriFactorStruct *factor,Mat_SeqAIJCUSPARSETriFactorStruct **factorT)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *T;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseFillMode_t                fillMode;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  T = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* matrix descriptor: same type/base/diag as the source, opposite fill mode */
  indexBase = cusparseGetMatIndexBase(factor->descr);
  fillMode  = cusparseGetMatFillMode(factor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  stat = cusparseCreateMatDescr(&T->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(T->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(T->descr, cusparseGetMatType(factor->descr));CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(T->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(T->descr, cusparseGetMatDiagType(factor->descr));CHKERRCUDA(stat);

  /* create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&T->solveInfo);CHKERRCUDA(stat);

  /* the transpose is stored explicitly, so the solve must not transpose again */
  T->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the factor */
  T->csrMat                 = new CsrMatrix;
  T->csrMat->num_rows       = factor->csrMat->num_rows;
  T->csrMat->num_cols       = factor->csrMat->num_cols;
  T->csrMat->num_entries    = factor->csrMat->num_entries;
  T->csrMat->row_offsets    = new THRUSTINTARRAY32(factor->csrMat->num_rows+1);
  T->csrMat->column_indices = new THRUSTINTARRAY32(factor->csrMat->num_entries);
  T->csrMat->values         = new THRUSTARRAY(factor->csrMat->num_entries);

  /* compute the transpose of the factor, i.e. the CSC */
  stat = cusparse_csr2csc(handle, factor->csrMat->num_rows,
                          factor->csrMat->num_cols, factor->csrMat->num_entries,
                          factor->csrMat->values->data().get(),
                          factor->csrMat->row_offsets->data().get(),
                          factor->csrMat->column_indices->data().get(),
                          T->csrMat->values->data().get(),
                          T->csrMat->column_indices->data().get(),
                          T->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(handle, T->solveOp,
                           T->csrMat->num_rows, T->csrMat->num_entries,
                           T->descr, T->csrMat->values->data().get(),
                           T->csrMat->row_offsets->data().get(), T->csrMat->column_indices->data().get(),
                           T->solveInfo);CHKERRCUDA(stat);

  *factorT = T;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Builds explicit transposes of both triangular factors (lower and upper) on
   the GPU and performs the cusparse solve analysis on them, so that
   MatSolveTranspose can run as two ordinary (non-transposed) triangular
   solves.  The results are stored in loTriFactorPtrTranspose and
   upTriFactorPtrTranspose of the Mat_SeqAIJCUSPARSETriFactors structure.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* the transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor(cusparseTriFactors->handle,
                                             (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr,
                                             &loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* the transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor(cusparseTriFactors->handle,
                                             (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr,
                                             &upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
869 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds an explicit transpose of the matrix on the GPU and stores it in
   cusparsestruct->matTranspose, so that MatMultTranspose can be performed as
   an ordinary (non-transposed) multiply against it.  CSR storage is
   transposed directly with csr2csc; ELL/HYB storage is converted to CSR,
   transposed, and converted back, which requires CUDA 5.0 or later.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information.
     NOTE(review): matstructT is unconditionally replaced here, so its
     initializer from cusparsestruct->matTranspose above is never read. */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalar copies used by the SpMV calls */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  /* the scalars live on the device, so cusparse must dereference device pointers */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* NOTE(review): num_rows/num_cols are not swapped for the transpose and
       row_offsets is sized by rmap->n; this looks correct only for square
       matrices (rmap->n == cmap->n) -- confirm. */
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC stored as CSR */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries: the CSR copies are only needed during conversion */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices.
     NOTE(review): this is left as an empty array -- presumably the transpose
     is never stored in compressed-row form; confirm against the MatMult path. */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
998 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b on the GPU for a factorization with non-identity
   orderings.  The explicitly transposed triangular factors are built lazily
   on the first call; the right-hand side is permuted with rpermIndices, the
   transposed upper factor is solved first, then the transposed lower factor
   (since (LU)^T = U^T L^T), and the result is permuted with cpermIndices.

   NOTE(review): the same rperm/cperm as the forward solve are applied in the
   same positions; for a transpose solve one might expect the two
   permutations to be swapped -- confirm against the CPU transpose solve.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder b with the row permutation, writing the result into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and synchronize before returning to the host */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1068 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A^T x = b on the GPU for a factorization in natural ordering: no
   permutation of the right-hand side or solution is needed.  The explicitly
   transposed factors are built lazily on the first call; the transposed
   upper factor is solved first, then the transposed lower factor (since
   (LU)^T = U^T L^T), using workVector as the intermediate.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve U: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the vector arrays and synchronize before returning to the host */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1119 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solves A x = b on the GPU for a factorization with non-identity orderings:
   the right-hand side is permuted with rpermIndices, L is solved, then U,
   and the solution is permuted with cpermIndices (via workVector, since the
   permutation cannot be applied in place).  Both vectors must be of type
   VECSEQCUDA.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  VecType                               t;
  PetscBool                             flg;

  PetscFunctionBegin;
  /* both vectors must already live on the GPU */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder b with the row permutation, writing the result into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and synchronize before returning to the host */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1188 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A x = b on the GPU for a factorization in natural ordering: no
   permutation of the right-hand side or solution is needed.  L is solved
   first, then U, using workVector as the intermediate.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the vector arrays and synchronize before returning to the host */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1231 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copy (or rebuild) the CUSPARSE representation of the matrix from the host
   AIJ data.  If the matrix is assembled, the nonzero pattern is unchanged and
   the storage format is CSR, only the numerical values are re-uploaded;
   otherwise the whole GPU structure (CSR or ELL/HYB) is destroyed and rebuilt.
   Rows with no nonzeros are squeezed out ("compressed row" storage) on the
   GPU even when the CPU matrix does not use compressed rows.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* same nonzero pattern and plain CSR: copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
    } else {
      /* the nonzero pattern (or the storage format) changed: rebuild everything */
      Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      /* also free the previous work vector, otherwise it is leaked on every
         rebuild triggered by a change of the nonzero state */
      delete cusparsestruct->workVector;
      cusparsestruct->workVector = 0;
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the matrix-multiply structure (NOT the triangular factors) */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident copies of ALPHA/BETA, used as the alpha/beta
           arguments of the SpMV calls (pointer mode DEVICE below) */
        err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* build a temporary device CSR matrix and convert it to HYB/ELL */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL is HYB with the "max" partition (no COO remainder) */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
#endif
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          /* ii/ridx were allocated above when forcing compressed rows */
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector of length m (number of nonzero rows) used by MatMultAdd */
        cusparsestruct->workVector = new THRUSTARRAY;
        cusparsestruct->workVector->resize(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      ierr = WaitForGPU();CHKERRCUDA(ierr);

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1371 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Create CUDA vectors conforming to this matrix: "right" matches the column
   layout (an input of MatMult), "left" matches the row layout (an output of
   MatMult).  Either output pointer may be NULL if that vector is not wanted.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    /* right vector: length = number of columns, block size = column block size */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix column layout instead of building a new one */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* left vector: length = number of rows, block size = row block size */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1397 
/* Functor for thrust::for_each over a zip iterator: accumulates the first
   tuple component into the second (dest += src).  Used to scatter-add the
   compressed SpMV result back into the full-length output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1407 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   yy = A*xx via cusparse SpMV, using the CSR structure or (when selected and
   CUDA >= 4.2) the HYB/ELL structure stored in A->spptr.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* fetch the GPU structure only AFTER the copy: the copy may destroy and
     rebuild cusparsestruct->mat, which would leave an earlier pointer dangling */
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* alpha/beta are device pointers (pointer mode DEVICE set in CopyToGPU) */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1449 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   yy = A^T * xx, computed by a NON_TRANSPOSE SpMV on the explicitly stored
   transpose, which is generated lazily on first use.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *T    = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  const PetscScalar            *xgpu;
  PetscScalar                  *ygpu;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!T) {
    /* the transpose structure does not exist yet; build it now */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    T    = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);

  if (cusp->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csr = (CsrMatrix*)T->mat;
    stat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols,
                             csr->num_entries, T->alpha, T->descr,
                             csr->values->data().get(), csr->row_offsets->data().get(),
                             csr->column_indices->data().get(), xgpu, T->beta,
                             ygpu);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hyb = (cusparseHybMat_t)T->mat;
    stat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             T->alpha, T->descr, hyb,
                             xgpu, T->beta,
                             ygpu);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is in use */
  if (!cusp->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1497 
1498 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A*xx.  The SpMV writes into the compressed work vector (one entry
   per nonzero row), and the result is then scatter-added into zz through the
   cprowIndices map.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* fetch the GPU structure only AFTER the copy: the copy may destroy and
     rebuild cusparsestruct->mat, which would leave an earlier pointer dangling */
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* zz starts as a copy of yy; the product is accumulated into it below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* here we need to be careful to set the number of rows in the multiply to the
       number of compressed rows in the matrix ... which is equivalent to the
       size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray, matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1559 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A^T*xx.  Runs a NON_TRANSPOSE SpMV on the explicitly stored
   transpose (generated lazily on first use) into the compressed work vector,
   then scatter-adds that result into zz through cprowIndices.
   NOTE(review): workVector is sized in MatSeqAIJCUSPARSECopyToGPU to the
   number of nonzero rows of A, while the transpose product has length
   A->cmap->n -- confirm these always agree for matrices reaching this path.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!matstructT) {
    /* the transpose structure is built on demand, on the first transpose product */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* zz starts as a copy of yy; the product is accumulated into it below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the SpMV entirely when the work vector is empty (no nonzero rows) */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1626 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Finish the host-side assembly, push the data to the GPU (for non-factored
   matrices), and install the CUSPARSE mat-vec kernels once the assembly is
   a final (not flush) assembly.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* the matrix is fully assembled: switch to the GPU kernels */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1645 
1646 /* --------------------------------------------------------------------------------*/
1647 #undef __FUNCT__
1648 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1649 /*@
1650    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1652    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1653    assembly performance the user should preallocate the matrix storage by setting
1654    the parameter nz (or the array nnz).  By setting these parameters accurately,
1655    performance during matrix assembly can be increased by more than a factor of 50.
1656 
1657    Collective on MPI_Comm
1658 
1659    Input Parameters:
1660 +  comm - MPI communicator, set to PETSC_COMM_SELF
1661 .  m - number of rows
1662 .  n - number of columns
1663 .  nz - number of nonzeros per row (same for all rows)
1664 -  nnz - array containing the number of nonzeros in the various rows
1665          (possibly different for each row) or NULL
1666 
1667    Output Parameter:
1668 .  A - the matrix
1669 
1670    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1672    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1673 
1674    Notes:
1675    If nnz is given then nz is ignored
1676 
1677    The AIJ format (also called the Yale sparse matrix format or
1678    compressed row storage), is fully compatible with standard Fortran 77
1679    storage.  That is, the stored row and column indices can begin at
1680    either one (as in Fortran) or zero.  See the users' manual for details.
1681 
1682    Specify the preallocated storage with either nz or nnz (not both).
1683    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1684    allocation.  For large problems you MUST preallocate memory or you
1685    will get TERRIBLE performance, see the users' manual chapter on matrices.
1686 
1687    By default, this format uses inodes (identical nodes) when possible, to
1688    improve numerical efficiency of matrix-vector products and solves. We
1689    search for consecutive rows with the same nonzero structure, thereby
1690    reusing matrix information to achieve increased efficiency.
1691 
1692    Level: intermediate
1693 
1694 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1695 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: the local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate with either nz per row or the per-row counts in nnz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1707 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Destroy the GPU-side data attached to A->spptr (SpMV structures for
   non-factored matrices, triangular-factor data otherwise), then the host
   SeqAIJ data.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype!=MAT_FACTOR_NONE) {
    /* factored matrices carry triangular-factor data in spptr */
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) {
    /* only free the SpMV structures when something was actually allocated */
    ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1725 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Constructor for MATSEQAIJCUSPARSE: builds the host SeqAIJ matrix, attaches
   the GPU-side container in B->spptr (the SpMV structure for ordinary
   matrices, the triangular-factor container for factored ones) and installs
   the CUSPARSE operations.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    /* the handle is set exactly once, from cusparseCreate (the previous code
       redundantly pre-initialized handle and stream and overwrote them) */
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1781 
1782 /*M
1783    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1784 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1786    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1787    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1788 
1789    Options Database Keys:
1790 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1791 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1792 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1793 
1794   Level: beginner
1795 
1796 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1797 M*/
1798 
1799 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1800 
1801 
#undef __FUNCT__
#define __FUNCT__ "MatSolverPackageRegister_CUSPARSE"
/* Register the CUSPARSE solver package for all factorization types it supports. */
PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  PetscInt            i;

  PetscFunctionBegin;
  for (i=0; i<4; i++) {
    ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1815 
1816 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
/*
   Free the SpMV data: both mult structures (matrix and its transpose), the
   work vector and the cusparse handle.  Safe to call when *cusparsestruct is
   NULL; the pointer is zeroed on return.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
    delete (*cusparsestruct)->workVector;
    /* only destroy the library handle if one was ever created */
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1837 
#undef __FUNCT__
#define __FUNCT__ "CsrMatrix_Destroy"
/* Free a CsrMatrix container and its device arrays; NULL-safe, zeroes *mat. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr;

  PetscFunctionBegin;
  csr = *mat;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1852 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
/*
   Free a single triangular-factor structure: the cusparse matrix descriptor,
   the solve-analysis info and the CSR data.  NULL-safe, zeroes *trifactor.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1870 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/*
   Free a mult structure: the stored matrix (HYB handle or CSR data, depending
   on the format), the matrix descriptor, the compressed-row index array and
   the device-resident alpha/beta scalars.  NULL-safe, zeroes *matstruct.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code; it was silently discarded before */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta)  { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1899 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/*
   Free the whole triangular-factor container: all four factor structures
   (lower/upper and their transposes), the row/column permutation index
   arrays, the work vector and the cusparse handle.  NULL-safe, zeroes
   *trifactors.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the destructor error codes; they were silently discarded before */
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* only destroy the library handle if one was ever created */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1924 
1925