xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision be33224567306cb44901cfc55a41c9126afd0e6e)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 const char * const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 /* this is such a hack ... but I don't know of another way to pass this variable
18    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19    SpMV. Essentially, I need to use the same stream variable in two different
20    data structures. I do this by creating a single instance of that stream
21    and reuse it. */
22 cudaStream_t theBodyStream=0;
23 
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
31 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query routine composed onto factored matrices so that
   MatFactorGetSolverPackage() reports MATSOLVERCUSPARSE for factors
   produced by this file (see MatGetFactor_seqaij_cusparse below).
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
44 
45 EXTERN_C_BEGIN
46 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
47 EXTERN_C_END
48 /*
49   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
50   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
51 
52    Level: beginner
53 */
54 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine for MATSOLVERCUSPARSE: creates the factor matrix B for A.
   Creation of the factor is delegated to the standard seqaij factory; the
   factor is then retyped to MATSEQAIJCUSPARSE and its LU/ILU symbolic
   function pointers are redirected to the CUSPARSE variants so that the
   triangular solves run on the GPU.  Only LU, ILU, and ILUDT factor types
   are supported; anything else raises PETSC_ERR_SUP.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* build the factor matrix with the standard AIJ (CPU) factory first */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    /* let MatFactorGetSolverPackage() report "cusparse" for this factor */
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
75 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat().
   MAT_CUSPARSE_MULT stores the format on this matrix's GPU struct;
   MAT_CUSPARSE_SOLVE stores it in the file-scope variable
   cusparseMatSolveStorageFormat (shared by all triangular factors);
   MAT_CUSPARSE_ALL sets both.
   NOTE(review): A->spptr is cast to Mat_SeqAIJCUSPARSE unconditionally;
   for factored matrices spptr holds Mat_SeqAIJCUSPARSETriFactors instead —
   confirm callers only pass MAT_CUSPARSE_MULT/ALL for unfactored matrices.
*/
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparseMat->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseMat->format = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END
101 
102 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for AIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A : Matrix of type SEQAIJCUSPARSE
.  op : MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format : MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Output Parameter: none

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed;
     silently a no-op for matrix types that do not provide it */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
133 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the -mat_cusparse_*_storage_format runtime options for a
   SEQAIJCUSPARSE matrix.  Unfactored matrices take the SpMV format option;
   factored matrices take the triangular-solve format option; the combined
   -mat_cusparse_storage_format option applies to both.

   Bug fix: PetscOptionsHead() was previously called BEFORE
   PetscObjectOptionsBegin(), i.e. outside the options block whose state it
   relies on, and had no matching PetscOptionsTail().  The Head/Tail pair is
   now properly nested inside Begin/End.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* unfactored matrix: format applies to MatMult (SpMV) */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    /* factored matrix: format applies to the triangular solves */
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* combined option overrides both SpMV and TriSolve formats */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   ILU symbolic factorization: performed entirely by the CPU AIJ kernel;
   only the numeric-factorization hook is redirected so that, after the
   numeric phase, the triangular factors get copied to the GPU.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
180 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   LU symbolic factorization: performed entirely by the CPU AIJ kernel;
   only the numeric-factorization hook is redirected so that, after the
   numeric phase, the triangular factors get copied to the GPU.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
192 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
   Builds the unit lower triangular factor L on the GPU from the host-side
   factored Mat_SeqAIJ storage of A.  In that storage, rows ai[i]..ai[i+1]
   hold the strictly-lower entries of row i (row 0 has none), so the CSR
   built here appends an explicit 1.0 diagonal entry to each row.  The
   resulting matrix is handed to cusparse as TRIANGULAR/FILL_MODE_LOWER and
   its solve analysis is run, then stored in loTriFactorPtr.
   Only rebuilds when the GPU copy is unallocated or stale (CPU-newer).
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;       /* pinned host CSR row pointers / column indices of L */
  PetscScalar *AALo;           /* pinned host CSR values of L */
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* strictly-lower entries of rows 1..n-1 (ai[n]-ai[1]) plus n unit diagonals.
         NOTE(review): for n==0 this reads ai[1], one past the row-pointer array — confirm callers guarantee n>0 */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host memory so setMatrix's upload is fast/async-capable */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      /* row 0 of L is just the unit diagonal */
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
        nz  = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      /* instantiate the GPU matrix in the format chosen for triangular solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      /* NOTE(review): these frees are skipped if an error/exception occurs above,
         leaking pinned host memory — acceptable on the error path but worth confirming */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
260 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/*
   Builds the upper triangular factor U on the GPU from the host-side
   factored Mat_SeqAIJ storage of A.  The a->diag array addresses the upper
   part in reverse: the entries of row i (excluding the diagonal) live at
   aa[adiag[i+1]+1 .. adiag[i]-1], with the (inverted) pivot at aa[adiag[i]-?]
   -- here accessed as v[nz] and stored as 1./v[nz], i.e. the CSR built on
   the GPU carries the reciprocal pivot as its diagonal entry.  The matrix
   is registered with cusparse as TRIANGULAR/FILL_MODE_UPPER, analyzed, and
   stored in upTriFactorPtr.  Only rebuilds when the GPU copy is
   unallocated or stale (CPU-newer).
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;       /* pinned host CSR row pointers / column indices of U */
  PetscScalar *AAUp;           /* pinned host CSR values of U */
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases with i, so the total upper nnz is adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host memory so setMatrix's upload is fast/async-capable */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first
         since the source layout is stored in reverse row order */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v   = aa + adiag[i+1] + 1;
        vi  = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the host factor stores the inverted pivot; re-invert to get u_ii */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        /* then copy the off-diagonal entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* instantiate the GPU matrix in the format chosen for triangular solves */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      /* NOTE(review): skipped on the error path above, leaking pinned memory */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
324 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
   Copies both triangular factors of a factored matrix to the GPU, runs the
   cusparse solve analysis for each, allocates the scratch vector used
   between the lower and upper solves, and installs the row/column
   permutations on the factors when the orderings are not the identity.

   Bug fix: the two `if (!row_identity)` / `if (!col_identity)` statements
   previously guarded only the setOrdIndices() assignment, so CHKERRCUSP(ierr)
   ran unconditionally and checked the stale return code of ISIdentity()
   rather than setOrdIndices().  Braces now scope the check correctly.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* scratch vector holding the intermediate result between L- and U-solves */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular indices: row permutation */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: inverse column permutation */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
359 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU/ILU factorization: the factorization itself runs on the CPU
   via the standard AIJ kernel; afterwards the triangular factors are copied
   to the GPU and analyzed.  The solve routine is chosen based on whether
   both orderings are the identity (natural ordering avoids the GPU-side
   permutation steps).
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode   ierr;
  Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
  IS               isrow = b->row,iscol = b->col;
  PetscBool        row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
381 
382 
383 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve xx = U^{-1} L^{-1} bb for factors with non-identity
   row/column orderings: the right-hand side is permuted into xx
   (reorderIn), the two solves run through the shared scratch vector, and
   the result is permuted back (reorderOut).  Flops counted as
   2*nz(factor) - n, matching the CPU MatSolve_SeqAIJ accounting.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU=PETSC_NULL, *bGPU=PETSC_NULL;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering: permute b into x, L-solve into temp, U-solve
     back into x, then undo the permutation in place */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  /* block until the (asynchronous) solves have finished before logging */
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
414 
415 
416 
417 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU triangular solve xx = U^{-1} L^{-1} bb for factors whose row and
   column orderings are both the identity: no permutation steps are needed,
   so the L-solve reads bb directly and the U-solve writes xx directly.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode    ierr;
  CUSPARRAY         *xGPU=PETSC_NULL, *bGPU=PETSC_NULL;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve: L-solve into the scratch vector, then U-solve into x */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  /* block until the (asynchronous) solves have finished before logging */
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
446 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies an (unfactored) SeqAIJ matrix to the GPU in the user-selected
   storage format.  Rebuilds from scratch whenever the GPU copy is
   unallocated or the CPU copy is newer: the old GPU matrix is deleted, a
   compressed-row index set (skipping empty rows) is constructed if the CPU
   matrix does not already use one, and the matrix plus row-index map is
   uploaded.  On success the matrix is marked PETSC_CUSP_BOTH.
*/
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat) {
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count the rows that actually contain nonzeros */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        /* NOTE(review): ii/ridx allocated here are only freed on the success
           path below; an exception in between leaks them — confirm acceptable */
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k] = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).  */
      /* NOTE(review): uses PETSC_COMM_WORLD rather than the matrix's own
         communicator — verify this is intended for the parallel SpMV setup */
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* scratch vector sized to the compressed row count, used by SpMV kernels */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
531 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates CUSP-typed work vectors compatible with this matrix: the right
   vector sized by the column layout (domain) and the left vector by the
   row layout (range).  Either output pointer may be NULL to skip creation.
   NOTE(review): the right vector's block size is set from mat->rmap->bs
   (row layout) rather than mat->cmap->bs — confirm intended for
   non-square block sizes.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
555 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   GPU sparse matrix-vector product yy = A*xx.  Assumes the matrix has
   already been copied to the GPU by MatAssemblyEnd_SeqAIJCUSPARSE.
   Skips the final device synchronization when the GPU matrix runs on a
   nonzero stream (the parallel SpMV path synchronizes elsewhere).
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray=PETSC_NULL,*yarray=PETSC_NULL;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* one multiply and one add per nonzero, minus the first add in each nonempty row */
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
583 
584 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   GPU transpose product yy = A^T*xx (A^H, the conjugate transpose, in
   complex builds — matching PETSc's MatMultTranspose semantics for the
   complex case as implemented here via the HERMITIAN flag).  Assumes the
   matrix has already been copied to the GPU by MatAssemblyEnd.
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray=PETSC_NULL,*yarray=PETSC_NULL;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
616 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   GPU multiply-add zz = A*xx + yy: yy is first copied into zz on the
   device, then multiplyAdd accumulates A*xx into zz in place.
   NOTE(review): yarray is acquired and restored but never passed to the
   kernel — it appears redundant; confirm before removing.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray=PETSC_NULL,*yarray=PETSC_NULL,*zarray=PETSC_NULL;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* seed zz with yy so the kernel can accumulate into it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
649 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   GPU transpose multiply-add zz = A^T*xx + yy (A^H in complex builds):
   yy is copied into zz on the device, then the transpose multiply-add runs.

   Bug fix: __FUNCT__ was previously defined as "MatMultAdd_SeqAIJCUSPARSE",
   so PETSc error tracebacks misreported which routine failed.

   NOTE(review): the kernel target here is yarray, unlike
   MatMultAdd_SeqAIJCUSPARSE which accumulates into zarray after the
   VecCopy into zz — verify against the txpetscgpu multiplyAdd semantics
   whether zarray was intended.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* seed zz with yy so the kernel can accumulate into it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
686 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Finalizes assembly: runs the standard AIJ assembly, pushes the assembled
   matrix to the GPU, and (on final assembly only) installs the CUSPARSE
   mult/multadd/multtranspose/multtransposeadd kernels.  Note the GPU copy
   happens even for MAT_FLUSH_ASSEMBLY, before the early return.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
703 
704 /* --------------------------------------------------------------------------------*/
705 #undef __FUNCT__
706 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
707 /*@
708    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
709    (the default parallel PETSc format). This matrix will ultimately pushed down
710    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
711    assembly performance the user should preallocate the matrix storage by setting
712    the parameter nz (or the array nnz).  By setting these parameters accurately,
713    performance during matrix assembly can be increased by more than a factor of 50.
714 
715    Collective on MPI_Comm
716 
717    Input Parameters:
718 +  comm - MPI communicator, set to PETSC_COMM_SELF
719 .  m - number of rows
720 .  n - number of columns
721 .  nz - number of nonzeros per row (same for all rows)
722 -  nnz - array containing the number of nonzeros in the various rows
723          (possibly different for each row) or PETSC_NULL
724 
725    Output Parameter:
726 .  A - the matrix
727 
728    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
729    MatXXXXSetPreallocation() paradgm instead of this routine directly.
730    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
731 
732    Notes:
733    If nnz is given then nz is ignored
734 
735    The AIJ format (also called the Yale sparse matrix format or
736    compressed row storage), is fully compatible with standard Fortran 77
737    storage.  That is, the stored row and column indices can begin at
738    either one (as in Fortran) or zero.  See the users' manual for details.
739 
740    Specify the preallocated storage with either nz or nnz (not both).
741    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
742    allocation.  For large problems you MUST preallocate memory or you
743    will get TERRIBLE performance, see the users' manual chapter on matrices.
744 
745    By default, this format uses inodes (identical nodes) when possible, to
746    improve numerical efficiency of matrix-vector products and solves. We
747    search for consecutive rows with the same nonzero structure, thereby
748    reusing matrix information to achieve increased efficiency.
749 
750    Level: intermediate
751 
752 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
753 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global dimensions coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate via the SeqAIJ implementation routine directly; the cast drops
     const because that routine takes a non-const PetscInt* -- presumably it does
     not modify nnz, TODO confirm against MatSeqAIJSetPreallocation_SeqAIJ */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
765 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Frees the GPU-side data hanging off A->spptr,
   then delegates to MatDestroy_SeqAIJ for the host-side AIJ structure.

   An unfactored matrix carries a Mat_SeqAIJCUSPARSE (GPU matrix + work vector)
   in spptr; a factored matrix carries a Mat_SeqAIJCUSPARSETriFactors (lower and
   upper triangular factors + work vector).
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;  /* only meaningful when factortype==MAT_FACTOR_NONE */

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      /* the GPU matrix object only exists once something was copied to the GPU */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
        delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      /* txpetscgpu/CUSP classes signal failure by throwing a message string */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is shared by all aijcusparse matrices
     (created once in MatCreate_SeqAIJCUSPARSE) yet is destroyed here when the
     FIRST matrix is destroyed; other live matrices would then hold a dangling
     handle -- confirm whether a reference count is needed */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
811 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor for MATSEQAIJCUSPARSE.

   Builds the host-side SeqAIJ matrix first, then attaches the GPU-side data
   to B->spptr: a Mat_SeqAIJCUSPARSE container for a regular matrix, or a
   Mat_SeqAIJCUSPARSETriFactors container when B is a factored matrix.
   Finally the operation table is pointed at the CUSPARSE implementations.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *gpuMat = new Mat_SeqAIJCUSPARSE;
    gpuMat->mat     = 0;
    gpuMat->tempvec = 0;
    gpuMat->format  = MAT_CUSPARSE_CSR;
    B->spptr        = gpuMat;
  } else {
    /* set the pointers to the (not yet computed) triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *triFactors = new Mat_SeqAIJCUSPARSETriFactors;
    triFactors->loTriFactorPtr = 0;
    triFactors->upTriFactorPtr = 0;
    triFactors->tempvec        = 0;
    triFactors->format         = cusparseMatSolveStorageFormat;
    B->spptr                   = triFactors;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  /* route the core operations through the CUSPARSE implementations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
859 
860 /*M
861    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
862 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
   the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
   Use --download-txpetscgpu when building/installing PETSc to enable the CUSPARSE library
   and the different GPU storage formats.
868 
869    Options Database Keys:
870 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
871 .  -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
872 .  -mat_cusparse_mult_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
873 -  -mat_cusparse_solve_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
874 
875   Level: beginner
876 
877 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
878 M*/
879