xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 53a7a83062c7187dea4a23d019e2208d10924b85)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix's GPU data
   and makes the matrix's cuSPARSE handle issue its work on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(cstat);
  PetscFunctionReturn(0);
}
51 
/*
   MatCUSPARSESetHandle - Installs an externally created cuSPARSE handle on the matrix.
   Any previously held (different) handle is destroyed first, and the pointer mode of
   the installed handle is set to device pointers.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  /* swap in the new handle only when it actually differs */
  if (cusp->handle != handle) {
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUDA(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(cstat);
  PetscFunctionReturn(0);
}
67 
/*
   MatCUSPARSEClearHandle - Forgets (without destroying) the cuSPARSE handle stored in
   the matrix, so that a handle owned elsewhere is not freed when the matrix is torn down.

   Fix: guard against A->spptr being NULL (a matrix whose GPU data was never allocated
   would otherwise crash with a NULL dereference here).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct && cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
76 
/*
   MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver
   package backing this factor matrix; composed on factors created by
   MatGetFactor_seqaijcusparse_cusparse() below.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) n-by-n factor matrix B that
   will hold an LU/ILU or Cholesky/ICC factorization of the SEQAIJCUSPARSE matrix A.

   Only the symbolic-factorization entry points are installed here; each symbolic routine
   installs the matching CUSPARSE numeric routine on B itself.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;  /* factors are square, sized by A's row count */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  /* set before MatSetType -- presumably consulted during type construction; confirm */
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* allocation is skipped here: the factor's storage is created by the symbolic stage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat().  A sequential AIJCUSPARSE matrix keeps a single storage
   format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set that one field; any
   other operation code is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops target the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
139 
140 /*@
141    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
142    operation. Only the MatMult operation can use different GPU storage formats
143    for MPIAIJCUSPARSE matrices.
144    Not Collective
145 
146    Input Parameters:
147 +  A - Matrix of type SEQAIJCUSPARSE
148 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
149 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
150 
151    Output Parameter:
152 
153    Level: intermediate
154 
155 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
156 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; a no-op for types that do not compose it */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
166 
/*
   Processes the -mat_cusparse_* options that choose the GPU storage format used for
   SpMV (and, via MAT_CUSPARSE_ALL, for everything this type supports).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* NOTE(review): for factored matrices A->spptr holds a Mat_SeqAIJCUSPARSETriFactors
     (see the factor routines below), so reading cusparsestruct->format here looks valid
     only in the unfactored case -- confirm before relying on it for factors */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
192 
/*
   Symbolic ILU: defer entirely to the CPU SeqAIJ implementation, then point the
   numeric stage at the CUSPARSE version so the factors get mirrored on the GPU.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
202 
/*
   Symbolic LU: defer to the CPU SeqAIJ implementation, then install the CUSPARSE
   numeric routine on the factor.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
212 
/*
   Symbolic ICC: defer to the CPU SeqAIJ implementation, then install the CUSPARSE
   Cholesky numeric routine on the factor.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222 
/*
   Symbolic Cholesky: defer to the CPU SeqAIJ implementation, then install the
   CUSPARSE numeric routine on the factor.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
232 
/*
   Builds the unit-lower-triangular factor L of an ILU factorization on the GPU.

   The strictly-lower entries are copied out of the host SeqAIJ factor row by row, a
   1.0 diagonal entry is appended to each row, and the result is uploaded as a 32-bit
   CSR matrix; cuSPARSE solve-analysis data for the subsequent triangular solves is
   computed immediately.  Rebuilds only when the GPU copy is missing or out of date.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;   /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AALo;          /* pinned host CSR values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit diagonals plus the strictly-lower entries of rows 1..n-1 (row 0 has none)
         -- assumes the factored-AIJ row layout stores L first in each row; confirm */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned allocations so the thrust assign()s below copy at full bandwidth */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;    /* row 0 is just the unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* append the implicit unit diagonal after the copied strictly-lower entries */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* each assign() uploads the pinned host array to device memory */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust/CUDA failures throw std exceptions; a char* catch presumably
         only matches hand-thrown C-string messages -- confirm intended coverage */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
336 
/*
   Builds the upper-triangular factor U of an ILU factorization on the GPU.

   The host SeqAIJ factor exposes U through a->diag; rows are walked from the bottom
   up, the stored inverse diagonal is turned back into 1/v so U carries an explicit
   non-unit diagonal, and the result is uploaded as a 32-bit CSR matrix with cuSPARSE
   solve-analysis data.  Rebuilds only when the GPU copy is missing or out of date.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AAUp;          /* pinned host CSR values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases as the row index grows -- assumes the factored-AIJ layout
         stores U backwards behind adiag; confirm */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];   /* v[nz] holds the stored inverse diagonal */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* each assign() uploads the pinned host array to device memory */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): see the matching catch in the lower-tri builder -- char* only
         matches hand-thrown C-string messages */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
436 
/*
   Builds both ILU triangular factors on the GPU, allocates the shared work vector,
   and uploads the row/column permutations when the orderings are not the identity
   (identity orderings need no index arrays on the device).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  factors->workVector = new THRUSTARRAY(n);
  factors->nnz        = a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;

  /* row permutation, consumed by the lower-triangular solve */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r, r+n);
    nperm += n;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation, consumed by the upper-triangular solve */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c, c+n);
    nperm += n;
  }

  if (nperm) {
    ierr = PetscLogCpuToGpu(nperm*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
483 
/*
   Builds both triangular factors of an ICC/Cholesky factorization on the GPU from a
   single pass over the host data: the upper factor U (unit diagonal, negated
   off-diagonals scaled per the stored inverse diagonal) and a value array for the
   lower solve, which reuses U's sparsity pattern and is solved with the TRANSPOSE
   operation on an UPPER fill-mode descriptor.  Rebuilds only when the GPU copy is
   missing or out of date.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR pattern, shared by both factors */
  PetscScalar                       *AAUp;          /* values of U */
  PetscScalar                       *AALo;          /* values for the (transposed) lower solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is a Mat_SeqAIJ (see 'a' above); viewing it as Mat_SeqSBAIJ
     relies on the leading i/j/a fields of the two structs lining up -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored (inverse) diagonal of row i */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonals; the lower-solve values get an extra diagonal scaling */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the lower factor is expressed as U^T: UPPER fill mode + TRANSPOSE solve op */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity pattern as U, different values */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): char* only matches hand-thrown C-string messages, not std exceptions */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
630 
/*
   Builds the ICC triangular factors on the GPU, allocates the shared work vector,
   and uploads the (symmetric) permutation when the ordering is not the identity.
   The same index set serves as both row and column permutation.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(n);
  /* strict triangle is counted for both factors, the diagonal once */
  factors->nnz        = (a->nz-n)*2 + n;

  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(rip, rip+n);
    factors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
659 
/*
   Numeric LU: run the CPU SeqAIJ numeric factorization, select the MatSolve variants
   (the natural-ordering fast path applies only when both row and column orderings are
   the identity), then mirror the triangular factors on the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the solve kernels based on the orderings */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;

  /* build the GPU-side triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
684 
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* numeric Cholesky/ICC factorization happens on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* an identity ordering permits the permutation-free solve kernels */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
709 
/*
   Builds explicit transposes (CSC storage) of the cached lower and upper
   triangular factors and runs the cusparse solve analysis on them.  The
   results are cached in loTriFactorPtrTranspose/upTriFactorPtrTranspose so
   that MatSolveTranspose can run entirely on the GPU.

   Fix: the locals loTriFactorT/upTriFactorT were previously initialized from
   the ...PtrTranspose fields and then unconditionally overwritten with `new`;
   the dead (and misleading) initializers have been removed.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* built below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;  /* built below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor: same type,
     index base and diagonal type, but the fill mode flips because we store
     the transpose explicitly */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the transpose is stored explicitly, so the solve itself does not transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transpose so subsequent transpose solves can reuse it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor (fill mode flips) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the transpose is stored explicitly, so the solve itself does not transpose */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transpose so subsequent transpose solves can reuse it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
840 
/*
   Builds a GPU copy of the transpose of A, stored in the same format as A
   (CSR, ELL or HYB), and caches it in cusparsestruct->matTranspose for use by
   the transpose matrix-vector products.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose information.  NOTE(review): the value
     loaded into matstructT above is unconditionally overwritten here, so a
     pre-existing matTranspose would be leaked -- presumably callers only
     invoke this when matTranspose is NULL; verify at the call sites */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalar constants (1, 0, 1) for the
     SpMV calls, which use CUSPARSE_POINTER_MODE_DEVICE */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    /* the transpose swaps A's dimensions but keeps the nonzero count */
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of A: convert its CSR to CSC, which is the CSR of A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose).  NOTE(review):
       tempT's dimensions and the row_offsets allocation use A->rmap->n where
       the transpose has A->cmap->n rows -- correct only for square matrices;
       confirm the ELL/HYB path is restricted to square A */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB, using the same partitioning scheme as A */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete the intermediate CSR representations */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the identity sequence 0..A->cmap->n-1,
     i.e. every row of the transpose is treated as potentially nonzero */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* cache the transpose on the matrix */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
968 
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                           *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Build the transposed factors lazily, on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device views of the vectors */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* apply the row permutation: x = perm(b) */
  thrust::copy(thrust::make_permutation_iterator(bGPU,   fs->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, fs->rpermIndices->end()),
               xGPU);

  /* solve with the transposed upper factor first, result into the work vector */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        xarray, work->data().get());CHKERRCUDA(stat);

  /* then solve with the transposed lower factor, back into x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);

  /* apply the column permutation through the work vector (cannot be done in place) */
  thrust::copy(thrust::make_permutation_iterator(xGPU,   fs->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, fs->cpermIndices->end()),
               work->begin());
  thrust::copy(work->begin(), work->end(), xGPU);

  /* restore the vectors and log the work of the two triangular solves */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1037 
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors lazily, on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device views of the vectors */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* solve with the transposed upper factor first, b -> work */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        barray, work->data().get());CHKERRCUDA(stat);

  /* then solve with the transposed lower factor, work -> x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the vectors and log the work of the two triangular solves */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1088 
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                           *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* device views of the vectors */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* apply the row permutation: x = perm(b) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()),
               xGPU);

  /* forward solve with the lower factor, result into the work vector */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        xarray, work->data().get());CHKERRCUDA(stat);

  /* back solve with the upper factor, writing into x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);

  /* apply the column permutation through the work vector (cannot be done in place) */
  thrust::copy(thrust::make_permutation_iterator(xGPU, fs->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, fs->cpermIndices->end()),
               work->begin());
  thrust::copy(work->begin(), work->end(), xGPU);

  /* restore the vectors and log the work of the two triangular solves */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1149 
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device views of the vectors */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward solve with the lower factor, b -> work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        barray, work->data().get());CHKERRCUDA(stat);

  /* back solve with the upper factor, work -> x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the vectors and log the work of the two triangular solves */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1192 
/*
   MatSeqAIJCUSPARSECopyToGPU - pushes the host (CPU) AIJ data of A to the GPU.

   No-op when the matrix is pinned to the CPU or when the GPU copy is already
   current.  When the nonzero pattern is unchanged since the last copy and the
   format is CSR, only the numerical values are re-sent; otherwise the whole
   GPU representation (CSR or ELL/HYB) is destroyed and rebuilt.  The GPU copy
   is always stored in compressed-row form (zero rows removed).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* same nonzero pattern as the existing GPU CSR matrix: cheap path */
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* pattern changed (or first copy): throw away the old GPU matrix and rebuild.
         NOTE(review): the PetscErrorCode returned here is ignored; siblings use CHKERRQ */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count rows that have at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* host matrix is already compressed: reuse its row pointers and row index map */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];   /* row offset of the k-th nonempty row */
              ridx[k]= j;         /* original (uncompressed) row index   */
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalar constants 1, 0, 1 used as alpha/beta in SpMV calls
           (required because the handle is put in device pointer mode below) */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* build a temporary device CSR matrix, convert it to HYB/ELL, then free it */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL is HYB with the partition forced to maximum width */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed once converted */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only in the non-compressed-row branch */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector of length nonzerorow used by MatMultAdd to hold the compressed product */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1330 
/* Thrust functor for zip iterators: accumulates the first tuple component
   into the second, i.e. get<1>(t) += get<0>(t).  Used to scatter-add a
   compressed work vector into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1340 
/* Computes yy = A*xx by delegating to MatMultAdd with a NULL addend vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1349 
/*
   Computes yy = A^T * xx on the GPU.

   The transpose is realized by building (once, lazily) an explicit transposed
   matrix structure and applying a NON_TRANSPOSE SpMV to it, which is faster
   than cuSPARSE's transposed-operation code path.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* lazily build the explicit transpose the first time it is needed */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    /* yy = 1.0*A^T*xx + 0.0*yy (alpha/beta live on the device) */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB storage */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* one multiply+add per nonzero, minus the first add of each nonempty row */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1396 
1397 
/*
   Computes zz = A*xx + yy on the GPU; with yy == NULL this degenerates to
   zz = A*xx (and is what MatMult_SeqAIJCUSPARSE calls).

   When the GPU matrix is stored in compressed-row form (some rows empty),
   the SpMV produces only the nonempty-row entries into a work vector, which
   is then scatter-added into the full-length result via cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr == PETSC_TRUE means no rows were removed: SpMV can write into zz directly */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* dptr is where the SpMV result lands: zz itself, or the compressed work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place (beta=1) only when the SpMV writes directly into zz==yy */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
	cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result is in the work vector: start zz from yy, scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* SpMV already wrote A*xx into zz; add yy on top */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult with compressed storage: zero zz before the scatter-add */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1485 
/*
   Computes zz = A^T*xx + yy on the GPU.

   Uses the lazily-built explicit transpose structure (like
   MatMultTranspose_SeqAIJCUSPARSE): the product goes into the work vector
   and is then scatter-added on top of a copy of yy in zz.

   Fix: the WaitForGPU() that followed SETERRQ1 in the catch block was
   unreachable (SETERRQ1 returns from the function) and has been removed.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose the first time it is needed */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }

  try {
    /* start from zz = yy; the transpose product is added in below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta_zero,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
    }

    /* scatter the data from the temporary into the full vector with a += operation
       NOTE(review): the scatter length is A->cmap->n while workVector was sized from
       the row count -- presumably these agree for the transpose structure; verify */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1552 
/* Finishes assembly on the host, then (for final assembly only) mirrors the
   data to the GPU and installs the CUSPARSE multiply kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* only unfactored matrices are pushed to the GPU here */
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1569 
1570 /* --------------------------------------------------------------------------------*/
1571 /*@
1572    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1574    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1575    assembly performance the user should preallocate the matrix storage by setting
1576    the parameter nz (or the array nnz).  By setting these parameters accurately,
1577    performance during matrix assembly can be increased by more than a factor of 50.
1578 
1579    Collective
1580 
1581    Input Parameters:
1582 +  comm - MPI communicator, set to PETSC_COMM_SELF
1583 .  m - number of rows
1584 .  n - number of columns
1585 .  nz - number of nonzeros per row (same for all rows)
1586 -  nnz - array containing the number of nonzeros in the various rows
1587          (possibly different for each row) or NULL
1588 
1589    Output Parameter:
1590 .  A - the matrix
1591 
1592    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1593    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1594    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1595 
1596    Notes:
1597    If nnz is given then nz is ignored
1598 
1599    The AIJ format (also called the Yale sparse matrix format or
1600    compressed row storage), is fully compatible with standard Fortran 77
1601    storage.  That is, the stored row and column indices can begin at
1602    either one (as in Fortran) or zero.  See the users' manual for details.
1603 
1604    Specify the preallocated storage with either nz or nnz (not both).
1605    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1606    allocation.  For large problems you MUST preallocate memory or you
1607    will get TERRIBLE performance, see the users' manual chapter on matrices.
1608 
1609    By default, this format uses inodes (identical nodes) when possible, to
1610    improve numerical efficiency of matrix-vector products and solves. We
1611    search for consecutive rows with the same nonzero structure, thereby
1612    reusing matrix information to achieve increased efficiency.
1613 
1614    Level: intermediate
1615 
1616 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1617 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast drops const to match the preallocation routine's signature; nnz is not modified */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1629 
/* Releases the GPU-side data hanging off A->spptr (multiply structures for an
   unfactored matrix, triangular-factor structures otherwise), then destroys
   the underlying SeqAIJ matrix. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* nothing was ever allocated on the GPU when still PETSC_OFFLOAD_UNALLOCATED */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1645 
/* Duplicates A (per cpvalues) via the SeqAIJ duplicate, then equips the copy
   with fresh CUSPARSE bookkeeping, operations table, and type name.  The GPU
   data itself is not copied: the duplicate starts PETSC_OFFLOAD_UNALLOCATED. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    C->spptr           = cusp;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    tri->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1704 
/* Converts an existing SeqAIJ matrix B in place to SeqAIJCUSPARSE: switches
   the default vector type to CUDA, attaches CUSPARSE bookkeeping (multiply or
   triangular-factor variant), installs the GPU operations, and marks the GPU
   copy as not yet allocated. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    B->spptr           = cusp;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1758 
/* Type constructor: build the matrix as SeqAIJ, then convert it in place to SeqAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1768 
1769 /*MC
1770    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1771 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1773    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1774    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1775 
1776    Options Database Keys:
1777 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1778 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1779 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1780 
1781   Level: beginner
1782 
1783 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1784 M*/
1785 
1786 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1787 
1788 
/* Registers the CUSPARSE solver package for every factorization kind it
   supports (LU, Cholesky, ILU, ICC) on SeqAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1800 
1801 
/*
   Frees all GPU resources attached to an unfactored SeqAIJCUSPARSE matrix:
   the multiply structure, the (optional) transpose multiply structure, the
   work vector, and the cuSPARSE handle; then deletes the container itself
   and NULLs the caller's pointer.  Safe to call with *cusparsestruct == NULL.

   Fix: the `if (handle = ...)` assignment-in-condition (a classic
   -Wparentheses warning and typo magnet) is split into assignment + test.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
    delete (*cusparsestruct)->workVector;
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1820 
/* Deletes the three device arrays of a CsrMatrix and the struct itself,
   then NULLs the caller's pointer.  A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1833 
/* Releases one triangular-factor structure: its cuSPARSE matrix descriptor,
   its solve-analysis info, and its CSR storage; then deletes the struct and
   NULLs the caller's pointer.  A NULL *trifactor is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *fs = *trifactor;
    if (fs->descr)     { stat = cusparseDestroyMatDescr(fs->descr);CHKERRCUDA(stat); }
    if (fs->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(fs->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&fs->csrMat);CHKERRQ(ierr);
    delete fs;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1849 
/*
   Releases a multiply structure: the stored matrix (HYB handle or CSR arrays,
   chosen by 'format'), the matrix descriptor, the compressed-row index array,
   and the three device-resident alpha/beta scalars; then deletes the struct
   and NULLs the caller's pointer.  A NULL *matstruct is a no-op.

   Fix: the PetscErrorCode returned by CsrMatrix_Destroy() was silently
   ignored; it is now checked with CHKERRQ for consistency with the
   trifactor-destroy sibling.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1877 
/*
   Frees all GPU resources attached to a factored SeqAIJCUSPARSE matrix: the
   four triangular-factor structures, the permutation index arrays, the work
   vector, and the cuSPARSE handle; then deletes the container and NULLs the
   caller's pointer.  Safe to call with *trifactors == NULL.

   Fix: the `if (handle = ...)` assignment-in-condition (a -Wparentheses
   warning and typo magnet) is split into assignment + test, matching the
   fix in MatSeqAIJCUSPARSE_Destroy.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1900 
1901