xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision cf00fe3bd8118910b5dd973dc437575af1f9d82c)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* Human-readable names for the MatCUSPARSEStorageFormat enum values, in enum order
   (CSR, ELL, HYB); the trailing entries (enum type name, option prefix, NULL) follow
   the layout PetscOptionsEnum() expects (see its use in MatSetFromOptions_SeqAIJCUSPARSE) */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/*
   MatCUSPARSESetStream - caches a CUDA stream on the matrix and attaches it to the
   matrix' cuSPARSE handle, so subsequent cuSPARSE calls for this matrix execute on
   that stream.

   Input Parameters:
+  A      - matrix; A->spptr is assumed to hold a Mat_SeqAIJCUSPARSE with a valid handle
-  stream - the CUDA stream to use

   NOTE(review): no check that A->spptr or the handle is non-NULL — callers must have
   set the type to SEQAIJCUSPARSE first.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
51 
/*
   MatCUSPARSESetHandle - installs a cuSPARSE handle on the matrix, destroying any
   previously held handle that differs from the incoming one, and switches the
   handle's pointer mode to CUSPARSE_POINTER_MODE_DEVICE.

   Input Parameters:
+  A      - matrix; A->spptr is assumed to hold a Mat_SeqAIJCUSPARSE
-  handle - the cuSPARSE handle the matrix should use from now on
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* release the old handle only when one exists and it is not the one being installed;
     storing the same handle again is a no-op, so the assignment can be unconditional */
  if (cusparsestruct->handle && cusparsestruct->handle != handle) {
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
67 
/*
   MatCUSPARSEClearHandle - drops the matrix' reference to its cuSPARSE handle
   WITHOUT destroying it; ownership of the handle stays with whoever supplied it
   (contrast with MatCUSPARSESetHandle(), which destroys a replaced handle).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* unconditional store: writing 0 over an already-NULL handle is harmless,
     so the original guard is unnecessary */
  cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
76 
/*
   MatFactorGetSolverType_seqaij_cusparse - reports MATSOLVERCUSPARSE as the solver
   package behind a factored matrix; composed as "MatFactorGetSolverType_C" on the
   factor in MatGetFactor_seqaijcusparse_cusparse().
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an (empty) SEQAIJCUSPARSE matrix that
   will hold a factorization of A, wiring up the symbolic-factorization entry points
   appropriate for the requested factor type.

   Input Parameters:
+  A     - the matrix to be factored (sizes are taken from A->rmap->n)
-  ftype - the requested factor type (LU/ILU/ILUDT or Cholesky/ICC)

   Output Parameter:
.  B - the new factor matrix
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* select the symbolic kernels; the numeric kernels are installed by the symbolic phase */
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(); records the requested GPU storage format. For a sequential
   matrix MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single stored format.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:          /* fall through: both ops map to the same field here */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
139 
140 /*@
141    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
142    operation. Only the MatMult operation can use different GPU storage formats
143    for MPIAIJCUSPARSE matrices.
144    Not Collective
145 
146    Input Parameters:
147 +  A - Matrix of type SEQAIJCUSPARSE
148 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
149 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
150 
151    Output Parameter:
152 
153    Level: intermediate
154 
155 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
156 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod() invokes MatCUSPARSESetFormat_C only if it has been composed
     on A (i.e. A is a CUSPARSE type); otherwise this call is silently a no-op */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
166 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the command-line options that select
   the GPU storage format: -mat_cusparse_mult_storage_format (SpMV only, non-factored
   matrices) and -mat_cusparse_storage_format (all operations).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* the SpMV-only format can be chosen separately, but only for non-factored matrices */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);}
  }
  /* this format applies to every CUSPARSE operation (SpMV and triangular solve) */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
192 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU: delegates to the CPU SeqAIJ
   implementation and then redirects the numeric phase to the CUSPARSE version, which
   will upload the factors to the GPU after the CPU numeric factorization.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
202 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU: delegates to the CPU SeqAIJ
   implementation and installs the CUSPARSE numeric kernel on the factor.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
212 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC: delegates to the CPU SeqAIJ
   implementation and installs the CUSPARSE Cholesky numeric kernel on the factor.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky: delegates to the CPU
   SeqAIJ implementation and installs the CUSPARSE numeric kernel on the factor.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
232 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds the cuSPARSE CSR representation of the
   unit lower-triangular factor from a host-side SeqAIJ (I)LU factorization, uploads it to
   the GPU, and runs the cuSPARSE triangular-solve analysis on it. The result is stored in
   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr.

   The unit diagonal is made explicit here (one extra entry appended per row); row 0
   contributes only its unit diagonal. Work is skipped when the matrix is empty or the GPU
   copy is already current (offload state neither UNALLOCATED nor CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* rows 1..n-1 contribute ai[n]-ai[1] stored entries; each of the n rows adds one explicit unit diagonal */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host buffers (cudaMallocHost) for the staging copies below */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 holds only the unit diagonal; subsequent rows copy their stored entries and append a 1.0 diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): any previous loTriFactorPtr is overwritten below without being freed —
         presumably this path only runs once per factorization; verify against callers */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* unit-diagonal lower-triangular CSR, zero-based indexing */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      /* the assign() calls copy the staged host arrays into device (thrust) storage */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
336 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds the cuSPARSE CSR representation of the
   (non-unit-diagonal) upper-triangular factor from a host-side SeqAIJ (I)LU factorization,
   uploads it to the GPU, and runs the cuSPARSE triangular-solve analysis on it. The result
   is stored in ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr.

   The factored SeqAIJ layout addresses U's rows through a->diag (adiag), with rows walked
   from the bottom up; the diagonal entry is stored at aa[adiag[i]] and its reciprocal is
   taken when filling AAUp. Work is skipped when the matrix is empty or the GPU copy is
   already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs downward through the factored storage, so this difference is U's entry count */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers (cudaMallocHost) for the staging copies below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      /* rows are filled bottom-up, decrementing offset so each row lands in front of the previous one */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* v[nz] addresses aa[adiag[i]]; the reciprocal of the stored value becomes the CSR diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): any previous upTriFactorPtr is overwritten below without being freed —
         presumably this path only runs once per factorization; verify against callers */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* non-unit-diagonal upper-triangular CSR, zero-based indexing */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      /* the assign() calls copy the staged host arrays into device (thrust) storage */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
436 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both (I)LU triangular factors on the
   GPU, allocates the length-n work vector, and uploads the row/column permutations when
   they are not the identity.

   Fix: the logging of the permutation uploads was a three-way if/else chain repeating the
   same computation (and used `if(` spacing inconsistent with the rest of the file); it is
   replaced by a single call with the upload size computed from the two identity flags —
   the logged byte counts are unchanged in every case.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm;

  PetscFunctionBegin;
  /* build the two triangular factors (each is a no-op when the GPU copy is current) */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* length-n work vector; presumably scratch for the two-stage solve — used elsewhere */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log one n-sized PetscInt upload per non-identity permutation */
  nperm = (row_identity ? 0 : 1) + (col_identity ? 0 : 1);
  if (nperm) {
    ierr = PetscLogCpuToGpu(nperm*n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
483 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds both triangular factors of an ICC/Cholesky
   factorization on the GPU from the host-side factored data.

   Only the upper triangle (AiUp/AjUp/AAUp) is assembled on the host; the "lower" factor
   reuses the SAME upper-triangular sparsity (AiUp/AjUp with values AALo) and is solved with
   CUSPARSE_OPERATION_TRANSPOSE, which is why its descriptor also says FILL_MODE_UPPER.
   Work is skipped when the matrix is empty or the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above) and Mat_SeqSBAIJ here;
     the i/j/a members are read through the SBAIJ view — confirm the layouts agree */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers (cudaMallocHost) for the staging copies below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the row's diagonal value; both factors store its reciprocal on the diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries: negated for the upper factor, additionally scaled by the
             diagonal's reciprocal for the (transposed) lower factor */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* unit-diagonal upper-triangular CSR, zero-based indexing */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor is the stored upper matrix solved transposed, hence UPPER fill
         mode combined with the TRANSPOSE solve operation below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      /* same row offsets and column indices as the upper factor; only the values differ */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
630 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds both ICC triangular factors on the
   GPU and uploads the ordering permutation when it is not the identity.

   Because Cholesky/ICC uses a single symmetric permutation, rpermIndices and cpermIndices
   receive identical copies of it. nnz counts the off-diagonal entries once per factor plus
   the n diagonal entries: (a->nz - n)*2 + n.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* length-n work vector; presumably scratch for the two-stage solve — used elsewhere */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
659 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization: the factorization itself
   runs on the CPU via the SeqAIJ kernel; afterwards the triangular factors are copied to
   the GPU and the appropriate MatSolve variants are installed (the permutation-free
   "NaturalOrdering" ones when both row and column orderings are the identity).
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering          : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
684 
/*
   Numeric Cholesky factorization: runs the CPU SeqAIJ factorization, selects
   the appropriate GPU MatSolve variant, and uploads the triangular factors.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* perform the numeric factorization on the host */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the NaturalOrdering solvers skip the permutation steps, so use them
     whenever the (symmetric) ordering is the identity */
  ierr = ISIdentity(bseq->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
709 
/*
   Builds the transposes of the already-uploaded lower/upper triangular
   factors (via csr2csc, i.e. the CSC representation) and performs the
   cusparse solve analysis on them, enabling MatSolveTranspose on the GPU.

   Precondition: loTriFactorPtrTranspose/upTriFactorPtrTranspose have not
   been created yet (callers check before invoking); pre-existing transpose
   structures would be leaked by the unconditional allocations below.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below; the old initializers from the *PtrTranspose fields were dead stores */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips since the transpose of a lower factor is upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so no-transpose solves apply */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer so the solve routines can find the transpose */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     again the fill mode flips under transposition */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer so the solve routines can find the transpose */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
840 
/*
   Builds an explicit transpose of A on the GPU (CSR, or ELL/HYB via an
   intermediate CSR) so that MatMultTranspose can use a plain cusparse
   multiply; stores the result in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

   Fixes for rectangular matrices: A^T has A->cmap->n rows, so the transpose
   row-offset arrays must have A->cmap->n+1 entries (csr2csc writes a column
   pointer of length num_cols+1), and csr2hyb must be called with the
   transposed dimensions. The previous sizing with A->rmap->n was only
   correct for square matrices.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* allocated below; the old initializer from cusparsestruct->matTranspose was a dead store */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose multiply information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta (device-resident scalars for CUSPARSE_POINTER_MODE_DEVICE) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* A^T has cmap->n rows, so its row pointer array needs cmap->n+1 entries */
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC representation of A */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->cmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose); A^T is cmap->n x rmap->n */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert the CSR of A^T to HYB, with the transposed dimensions */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: A^T has cmap->n rows, all treated as nonzero */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
968 
/*
   GPU transpose triangular solve with row/column permutations.  Only
   installed as ops->solvetranspose when the orderings are non-trivial, so
   rpermIndices/cpermIndices are assumed to exist.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *b;
  PetscScalar                           *x;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *fs     = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                           *work   = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* lazily build and analyze the transposed factors on first use */
  if (!lowerT && !upperT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);
  xdev = thrust::device_pointer_cast(x);
  bdev = thrust::device_pointer_cast(b);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* gather b through the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bdev, fs->rpermIndices->begin()),
               thrust::make_permutation_iterator(bdev+n, fs->rpermIndices->end()),
               xdev);

  /* solve with the transposed upper factor first ... */
  stat = cusparse_solve(fs->handle, upperT->solveOp,
                        upperT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        x, work->data().get());CHKERRCUDA(stat);

  /* ... then with the transposed lower factor */
  stat = cusparse_solve(fs->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* gather x through the column permutation into the work vector (not doable in place) */
  thrust::copy(thrust::make_permutation_iterator(xdev, fs->cpermIndices->begin()),
               thrust::make_permutation_iterator(xdev+n, fs->cpermIndices->end()),
               work->begin());

  /* and copy the permuted result back into the solution */
  thrust::copy(work->begin(), work->end(), xdev);

  /* restore the vector arrays */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1037 
/*
   GPU transpose triangular solve for the natural (identity) ordering:
   no permutation steps are needed.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs     = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work   = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* lazily build and analyze the transposed factors on first use */
  if (!lowerT && !upperT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* solve with the transposed upper factor first ... */
  stat = cusparse_solve(fs->handle, upperT->solveOp,
                        upperT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* ... then with the transposed lower factor */
  stat = cusparse_solve(fs->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* restore the vector arrays */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1088 
/*
   GPU triangular solve (forward with L, backward with U) with row/column
   permutations.  Only installed as ops->solve when the orderings are
   non-trivial, so rpermIndices/cpermIndices are assumed to exist.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *b;
  PetscScalar                           *x;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *fs    = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lower = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upper = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                           *work  = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);
  xdev = thrust::device_pointer_cast(x);
  bdev = thrust::device_pointer_cast(b);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* gather b through the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bdev, fs->rpermIndices->begin()),
               thrust::make_permutation_iterator(bdev, fs->rpermIndices->end()),
               xdev);

  /* forward solve with the lower factor ... */
  stat = cusparse_solve(fs->handle, lower->solveOp,
                        lower->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        x, work->data().get());CHKERRCUDA(stat);

  /* ... then backward solve with the upper factor */
  stat = cusparse_solve(fs->handle, upper->solveOp,
                        upper->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* gather x through the column permutation into the work vector (not doable in place) */
  thrust::copy(thrust::make_permutation_iterator(xdev, fs->cpermIndices->begin()),
               thrust::make_permutation_iterator(xdev, fs->cpermIndices->end()),
               work->begin());

  /* and copy the permuted result back into the solution */
  thrust::copy(work->begin(), work->end(), xdev);

  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1149 
/*
   GPU triangular solve for the natural (identity) ordering: forward solve
   with L, backward solve with U, no permutation steps.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs    = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work  = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward solve with the lower factor ... */
  stat = cusparse_solve(fs->handle, lower->solveOp,
                        lower->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* ... then backward solve with the upper factor */
  stat = cusparse_solve(fs->handle, upper->solveOp,
                        upper->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1192 
/*
   MatSeqAIJCUSPARSECopyToGPU - ensures the GPU copy of a SeqAIJ matrix is current.

   No-op when the matrix is pinned to the CPU or the GPU copy is already valid.
   Fast path: if the matrix is assembled, its nonzero pattern is unchanged
   (A->nonzerostate matches the cached value) and the storage format is CSR, only
   the numerical values are re-uploaded.  Otherwise the GPU structure is destroyed
   and rebuilt from scratch.  Rows without nonzeros are always compressed out on
   the GPU: the SpMV structure covers cusparsestruct->nonzerorow rows and the
   original row indices are kept in matstruct->cprowIndices.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* same nonzero pattern as the existing GPU copy: copy values only */
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* pattern changed (or a non-CSR format is used): rebuild the whole GPU
         representation; check the error code instead of silently dropping it */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);CHKERRQ(ierr);
      try {
        /* count rows that actually contain nonzeros */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* the host already stores compressed rows: reuse its index arrays */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets (ii) and the map
             from compressed row to original row (ridx) on the host */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate and describe the matrix-multiply structure */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalar constants; POINTER_MODE_DEVICE below makes
           cusparse read alpha/beta from GPU memory */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: first stage the data on the device in CSR form ... */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* ... then convert CSR -> HYB (PARTITION_MAX yields pure ELL) */
          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the staging CSR copy is no longer needed once the HYB matrix exists */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          /* ii/ridx were temporaries built above, not the host matrix's arrays */
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector sized to the number of (compressed) rows, used by the SpMV wrappers */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1330 
/* Functor for thrust::for_each over a zip iterator: adds the first tuple
   element into the second, i.e. get<1>(t) += get<0>(t).  Used to scatter-add
   compressed-row SpMV results into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1340 
/* yy = A*xx, implemented by forwarding to MatMultAdd with no addend.
   Passing yy == NULL tells MatMultAdd_SeqAIJCUSPARSE there is nothing to add. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1349 
/*
   MatMultTranspose_SeqAIJCUSPARSE - computes yy = A^T * xx on the GPU.

   Works on an explicitly stored transpose of A (built lazily on first use by
   MatSeqAIJCUSPARSEGenerateTransposeForMult), so the product is performed as a
   NON_TRANSPOSE SpMV on that transpose rather than a transposed SpMV on A.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* lazily build the explicit transpose the first time it is needed */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* NOTE(review): yy is zeroed before the SpMV; presumably because the transpose
     may be stored with compressed rows so the SpMV would not write every entry
     of yy -- confirm against MatSeqAIJCUSPARSEGenerateTransposeForMult */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* y = alpha * A_T * x + beta_zero * y, with the stored transpose in CSR form */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB storage: same product through the hybrid-format SpMV */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is attached */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2 flops per nonzero, minus one add per nonempty row */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1397 
1398 
/*
   MatMultAdd_SeqAIJCUSPARSE - computes zz = A*xx + yy (or zz = A*xx when yy is
   NULL; MatMult_SeqAIJCUSPARSE calls this with yy == NULL).

   When rows were compressed out of the GPU matrix, the SpMV result is produced
   in cusparsestruct->workVector and then scattered/added into zz through
   matstruct->cprowIndices; otherwise the SpMV writes into zz directly.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* TRUE when NO rows were compressed out (workVector length == full row count) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      /* NOTE(review): when yy == zz and cmpr, beta_one below makes the SpMV READ
         zarray even though it was obtained with GetArrayWrite; assumes zz is
         already valid on the GPU in that case -- confirm */
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* SpMV destination: zz itself when uncompressed, otherwise the work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place (beta = 1) only when yy == zz is being added directly into zz */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      /* ELL/HYB storage; skip the SpMV entirely when the matrix has no nonempty rows */
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result is in workVector; seed zz with yy before the scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* result already in zz; add yy on top */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult with compressed rows: entries not touched by the scatter must be zero */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* zz[cprowIndices[k]] += workVector[k] for each compressed row k */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1486 
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - computes zz = A^T * xx + yy on the GPU.

   Uses the explicitly stored transpose of A (built lazily on first use); the
   product is a NON_TRANSPOSE SpMV on that transpose into workVector, which is
   then scatter-added into zz through matstructT->cprowIndices.

   Fix: the original catch block executed `ierr = WaitForGPU();` AFTER
   SETERRQ1, which returns from the function -- that statement was unreachable
   dead code and has been removed.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* lazily build the explicit transpose the first time it is needed */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }

  try {
    /* start from zz = yy, then scatter-add A^T*xx into it */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the SpMV entirely when the matrix has no nonempty rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta_zero,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
    }

    /* scatter the data from the temporary into the full vector with a += operation.
       NOTE(review): the iteration count here is A->cmap->n (columns of A = rows of
       A^T); assumes workVector and matstructT->cprowIndices cover at least that
       many entries -- confirm against the transpose generation code */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1553 
/* Assembly hook: finishes host-side SeqAIJ assembly, then (on final assembly of
   an unfactored matrix) mirrors the data to the GPU and installs the CUSPARSE
   multiply kernels in the operations table. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* a flush assembly needs no GPU work; only the final assembly triggers the copy */
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1570 
1571 /* --------------------------------------------------------------------------------*/
1572 /*@
1573    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1575    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1576    assembly performance the user should preallocate the matrix storage by setting
1577    the parameter nz (or the array nnz).  By setting these parameters accurately,
1578    performance during matrix assembly can be increased by more than a factor of 50.
1579 
1580    Collective
1581 
1582    Input Parameters:
1583 +  comm - MPI communicator, set to PETSC_COMM_SELF
1584 .  m - number of rows
1585 .  n - number of columns
1586 .  nz - number of nonzeros per row (same for all rows)
1587 -  nnz - array containing the number of nonzeros in the various rows
1588          (possibly different for each row) or NULL
1589 
1590    Output Parameter:
1591 .  A - the matrix
1592 
1593    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1595    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1596 
1597    Notes:
1598    If nnz is given then nz is ignored
1599 
1600    The AIJ format (also called the Yale sparse matrix format or
1601    compressed row storage), is fully compatible with standard Fortran 77
1602    storage.  That is, the stored row and column indices can begin at
1603    either one (as in Fortran) or zero.  See the users' manual for details.
1604 
1605    Specify the preallocated storage with either nz or nnz (not both).
1606    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1607    allocation.  For large problems you MUST preallocate memory or you
1608    will get TERRIBLE performance, see the users' manual chapter on matrices.
1609 
1610    By default, this format uses inodes (identical nodes) when possible, to
1611    improve numerical efficiency of matrix-vector products and solves. We
1612    search for consecutive rows with the same nonzero structure, thereby
1613    reusing matrix information to achieve increased efficiency.
1614 
1615    Level: intermediate
1616 
1617 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1618 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create an empty matrix, force the CUSPARSE implementation, then preallocate */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* nnz is not modified; the cast only drops const for the SeqAIJ preallocation routine */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1630 
/* Destroy hook: releases the GPU-side data hanging off A->spptr (plain matrix
   data or triangular-factor data, depending on the factor type), then delegates
   to the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices keep triangular-factor structures in spptr */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* unfactored matrices only have GPU data if something was ever offloaded */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1646 
/* Duplicate a SeqAIJCUSPARSE matrix: duplicate the host AIJ data, then attach a
   fresh (empty) GPU data structure, the CUSPARSE operations table, and the type
   name to the copy.  GPU data is not copied; the duplicate starts UNALLOCATED. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    C->spptr           = spptr;
    spptr->mat         = 0;
    spptr->matTranspose= 0;
    spptr->workVector  = 0;
    spptr->format      = MAT_CUSPARSE_CSR;
    spptr->stream      = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle      = handle;
    spptr->nonzerostate= 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    tri->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1705 
/* In-place conversion of a SeqAIJ matrix to SeqAIJCUSPARSE: switches the default
   vector type to CUDA, attaches an empty GPU data structure (plain or
   triangular-factor, depending on the factor type), and installs the CUSPARSE
   operations table.  GPU data itself is created later, at assembly time. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    B->spptr           = spptr;
    spptr->mat         = 0;
    spptr->matTranspose= 0;
    spptr->workVector  = 0;
    spptr->format      = MAT_CUSPARSE_CSR;
    spptr->stream      = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle      = handle;
    spptr->nonzerostate= 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1759 
/* Registered constructor for MATSEQAIJCUSPARSE: builds a plain SeqAIJ matrix
   and then converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1769 
1770 /*MC
1771    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1772 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1774    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1775    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1776 
1777    Options Database Keys:
1778 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1779 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1780 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1781 
1782   Level: beginner
1783 
1784 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1785 M*/
1786 
1787 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1788 
1789 
/* Registers the CUSPARSE solver package for every factorization type it
   supports (LU, Cholesky, ILU, ICC), all served by the same factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1801 
1802 
/*
   MatSeqAIJCUSPARSE_Destroy - frees the GPU data of an unfactored matrix: the
   multiply structures for A and A^T, the work vector, and the cusparse handle,
   then zeroes the caller's pointer.

   Fixes: the PetscErrorCode returned by the two MatSeqAIJCUSPARSEMultStruct_Destroy
   calls was silently dropped (file convention is ierr = ...;CHKERRQ(ierr)), and the
   handle test used an assignment inside the if-condition.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1821 
/* Frees the three device arrays of a CsrMatrix and the container itself, then
   zeroes the caller's pointer so it cannot be freed twice.  Safe on NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1834 
/* Frees one triangular-factor structure: cusparse matrix descriptor, solve
   analysis info and the CSR data, then the container; zeroes the caller's
   pointer.  Safe on NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1850 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy - frees a matrix-multiply structure: the
   stored matrix (HYB handle or CSR arrays depending on format), the cusparse
   descriptor, the compressed-row index array and the device-resident scalar
   constants, then zeroes the caller's pointer.

   Fix: the PetscErrorCode returned by CsrMatrix_Destroy was silently dropped;
   it is now checked with CHKERRQ, matching the file's error-handling convention.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1878 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - frees all four triangular-factor
   structures (lower/upper, plain and transposed), the permutation index arrays,
   the work vector and the cusparse handle, then zeroes the caller's pointer.

   Fixes: the PetscErrorCode returned by the four MatSeqAIJCUSPARSEMultStruct_Destroy
   calls was silently dropped (file convention is ierr = ...;CHKERRQ(ierr)), and the
   handle test used an assignment inside the if-condition.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1901 
1902