xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 18e2ec274b9e92144f2ef650b9690aa28f9a6737)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 const char * const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 /* this is such a hack ... but I don't know of another way to pass this variable
18    from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
19    SpMV. Essentially, I need to use the same stream variable in two different
20    data structures. I do this by creating a single instance of that stream
21    and reuse it. */
22 cudaStream_t theBodyStream=0;
23 
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30 PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
31 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Reports MATSOLVERCUSPARSE as the solver package that produced this factored matrix. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
44 
45 EXTERN_C_BEGIN
46 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
47 EXTERN_C_END
48 /*
49   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
50   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
51 
52    Level: beginner
53 */
54 
55 EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Creates the factor matrix for LU/ILU/ILUDT factorization. The factor is
   first obtained from the ordinary PETSc seqaij implementation and then
   converted to SEQAIJCUSPARSE so the triangular solves run on the GPU.
   Unsupported factor types raise PETSC_ERR_SUP.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype != MAT_FACTOR_LU && ftype != MAT_FACTOR_ILU && ftype != MAT_FACTOR_ILUDT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  /* let MatFactorGetSolverPackage() report "cusparse" for this factor */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
74 EXTERN_C_END
75 
76 EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation of MatCUSPARSESetFormat(): records the
   requested GPU storage format for SpMV, for the triangular solves, or
   for both, depending on the operation selector.
*/
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT) {
    cusparseMat->format = format;
  } else if (op == MAT_CUSPARSE_SOLVE) {
    cusparseMatSolveStorageFormat = format;
  } else if (op == MAT_CUSPARSE_ALL) {
    cusparseMat->format           = format;
    cusparseMatSolveStorageFormat = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
99 EXTERN_C_END
100 
101 
102 /*@
103    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
104    operation. Only the MatMult operation can use different GPU storage formats
105    for AIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
106    to build/install PETSc to use this package.
107 
108    Not Collective
109 
110    Input Parameters:
111 +  A : Matrix of type SEQAIJCUSPARSE
112 .  op : MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
113 -  format : MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
114 
115    Output Parameter:
116 
117    Level: intermediate
118 
.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
120 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed as "MatCUSPARSESetFormat_C";
     PetscTryMethod makes this a no-op for matrix types that do not compose it */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
131 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the runtime options selecting the GPU storage format of a
   SEQAIJCUSPARSE matrix. Unfactored matrices honor
   -mat_cusparse_mult_storage_format (SpMV); factored matrices honor
   -mat_cusparse_solve_storage_format (triangular solve); and
   -mat_cusparse_storage_format applies the chosen format to both roles.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); /* return code was previously ignored */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
165 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: delegates entirely to the CPU SeqAIJ
   implementation, then installs the CUSPARSE numeric routine so the
   triangular factors are copied to the GPU during the numeric stage.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
177 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: delegates entirely to the CPU SeqAIJ
   implementation, then installs the CUSPARSE numeric routine so the
   triangular factors are copied to the GPU during the numeric stage.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
189 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
/*
   Builds the lower triangular factor L on the GPU from the host factored
   AIJ matrix. The host factor stores L's strictly-lower entries with an
   implicit unit diagonal; here an explicit CSR copy with 1.0 entries on
   the diagonal is assembled in page-locked host memory, handed to the
   CUSPARSE wrapper object, and the triangular-solve analysis is run.
   Runs only when the GPU copy is missing or stale.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;   /* CSR row pointers / column indices of L being built */
  PetscScalar *AALo;       /* CSR values of L being built */
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1; row 0 has no lower part) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* page-locked (pinned) host memory so the host-to-device transfer is fast */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 consists of the unit diagonal entry only */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
        nz  = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        /* copy row i's strictly-lower entries ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the unit diagonal entry */
        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      /* create the GPU matrix in the requested storage format and run the solve analysis */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      /* free the pinned staging arrays (setMatrix() is assumed to have copied the data to the GPU) */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
257 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
/*
   Builds the upper triangular factor U on the GPU from the host factored
   AIJ matrix. The host factor addresses U through a->diag (adiag), whose
   entries decrease with the row index, so rows are walked from last to
   first while the CSR arrays are filled back to front. Runs only when the
   GPU copy is missing or stale.
*/
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;   /* CSR row pointers / column indices of U being built */
  PetscScalar *AAUp;       /* CSR values of U being built */
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with the row index, so this difference spans all upper-triangular entries */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* page-locked (pinned) host memory so the host-to-device transfer is fast */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--){
        /* adiag[i+1]+1 locates the start of row i's data in the factored storage */
        v   = aa + adiag[i+1] + 1;
        vi  = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* NOTE(review): v[nz] is taken to be the stored reciprocal pivot, so 1./v[nz]
           recovers the true diagonal of U -- confirm against MatLUFactorNumeric_SeqAIJ's layout */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* create the GPU matrix in the requested storage format and run the solve analysis */
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      /* free the pinned staging arrays (setMatrix() is assumed to have copied the data to the GPU) */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
322 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalysisAndCopyToGPU"
/*
   Copies both triangular factors of a factored matrix to the GPU, allocates
   the intermediate solve vector, and records the row/column permutations
   (only when they are not the identity) so MatSolve can reorder on the GPU.

   Fix: the two permutation calls previously read
       if (!identity) ierr = ...->setOrdIndices(...);CHKERRCUSP(ierr);
   so the CHKERRCUSP executed unconditionally, checking a stale ierr when
   the branch was not taken. Braces now scope the check to the call.
*/
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* scratch vector shared by the two-stage (L then U) triangular solve */
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
357 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU/ILU factorization: runs the CPU SeqAIJ numeric stage, chooses
   the appropriate GPU MatSolve variant (the natural-ordering solve skips
   the permutation step), and copies the factors to the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b     = (Mat_SeqAIJ *)B->data;
  IS             isrow  = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  B->ops->solve = (row_identity && col_identity) ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
379 
380 
381 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve xx = U \ (L \ bb) for factors computed with a
   non-trivial ordering: the right-hand side is permuted into xx, the two
   triangular solves run through the scratch vector, and the result is
   permuted back to the original ordering.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor           = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor           = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *work               = (CUSPARRAY*)cusparseTriFactors->tempvec;
  CUSPARRAY                    *xGPU,*bGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = loFactor->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = loFactor->solve(xGPU, work);CHKERRCUSP(stat);
  stat = upFactor->solve(work, xGPU);CHKERRCUSP(stat);
  ierr = upFactor->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
412 
413 
414 
415 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU triangular solve xx = U \ (L \ bb) for factors computed with the
   natural (identity) ordering: no permutation of the right-hand side or
   the solution is needed.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor           = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor           = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *work               = (CUSPARRAY*)cusparseTriFactors->tempvec;
  CUSPARRAY                    *xGPU,*bGPU;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* forward solve with L, then back solve with U */
  stat = loFactor->solve(bGPU, work);CHKERRCUSP(stat);
  stat = upFactor->solve(work, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
444 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies an assembled (non-factored) SEQAIJ matrix to the GPU, rebuilding
   the CUSPARSE matrix object from scratch. A compressed-row view (only
   rows with at least one nonzero) is always set up for the GPU matrix.
   No-op when the GPU copy is already current.
*/
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count the rows that contain at least one nonzero */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* the host matrix already provides compressed-row indexing; use it directly */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        /* build the compressed row pointers (ii) and original-row index map (ridx) here */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k] = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired).  */
      /* theBodyStream is the file-global stream shared with the parallel SpMV; see its definition */
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        /* ii/ridx were allocated above; setMatrix/setCPRowIndices are assumed to have copied them */
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* scratch vector sized to the number of compressed rows */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
529 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates GPU-resident (VECSEQCUSP) vectors compatible with this matrix:
   'right' matches the column layout, 'left' matches the row layout.
   Either output may be PETSC_NULL to skip creating that vector.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* NOTE(review): block size is taken from rmap even for the right (column-layout) vector -- confirm against cmap->bs */
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
554 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* Computes yy = A*xx on the GPU through the CUSPARSE wrapper object. */
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The copy to the GPU is handled by MatAssemblyEnd_SeqAIJCUSPARSE, so it is not repeated here */
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only synchronize when the multiply ran on the default stream */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
582 
583 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* Computes yy = A^T*xx (A^H*xx for complex scalars) on the GPU. */
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The copy to the GPU is handled by MatAssemblyEnd_SeqAIJCUSPARSE, so it is not repeated here */
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
#else
    /* complex scalars use the conjugate transpose */
    ierr = cusparseMat->mat->multiply(xGPU, yGPU, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only synchronize when the multiply ran on the default stream */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
615 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* Computes zz = yy + A*xx on the GPU: zz is first overwritten with yy, then A*xx is accumulated into it. */
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU,*zGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* The copy to the GPU is handled by MatAssemblyEnd_SeqAIJCUSPARSE, so it is not repeated here */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
    /* NOTE(review): yGPU is acquired/released but never passed to multiplyAdd; appears redundant -- confirm before removing */
    ierr = VecCUSPGetArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zGPU);CHKERRQ(ierr);

    /* multiply add: zz += A*xx */
    ierr = cusparseMat->mat->multiplyAdd(xGPU, zGPU);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zGPU);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
647 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A^T*xx (A^H*xx for complex scalars) on the GPU.

   Fixes relative to the previous version:
   - __FUNCT__ named "MatMultAdd_SeqAIJCUSPARSE" (copy-paste), so error
     tracebacks reported the wrong routine.
   - multiplyAdd accumulated into yarray instead of zarray, so zz came back
     as a plain copy of yy while yy itself was clobbered with the product.
     The accumulation target is now zarray, matching
     MatMultAdd_SeqAIJCUSPARSE (which copies yy into zz and then
     accumulates A*xx into the zz array).
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  /* The copy to the GPU is handled by MatAssemblyEnd_SeqAIJCUSPARSE, so it is not repeated here */
  try {
    /* start from zz = yy, then accumulate the (conjugate-)transposed product into zz */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose: zz += A^T*xx */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
683 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Completes assembly on the CPU, mirrors the matrix to the GPU, and (only
   once assembly is final, not on a flush) installs the GPU mat-vec
   operations.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
699 
700 /* --------------------------------------------------------------------------------*/
701 #undef __FUNCT__
702 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
703 /*@
704    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
706    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
707    assembly performance the user should preallocate the matrix storage by setting
708    the parameter nz (or the array nnz).  By setting these parameters accurately,
709    performance during matrix assembly can be increased by more than a factor of 50.
710 
711    Collective on MPI_Comm
712 
713    Input Parameters:
714 +  comm - MPI communicator, set to PETSC_COMM_SELF
715 .  m - number of rows
716 .  n - number of columns
717 .  nz - number of nonzeros per row (same for all rows)
718 -  nnz - array containing the number of nonzeros in the various rows
719          (possibly different for each row) or PETSC_NULL
720 
721    Output Parameter:
722 .  A - the matrix
723 
724    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
726    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
727 
728    Notes:
729    If nnz is given then nz is ignored
730 
731    The AIJ format (also called the Yale sparse matrix format or
732    compressed row storage), is fully compatible with standard Fortran 77
733    storage.  That is, the stored row and column indices can begin at
734    either one (as in Fortran) or zero.  See the users' manual for details.
735 
736    Specify the preallocated storage with either nz or nnz (not both).
737    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
738    allocation.  For large problems you MUST preallocate memory or you
739    will get TERRIBLE performance, see the users' manual chapter on matrices.
740 
741    By default, this format uses inodes (identical nodes) when possible, to
742    improve numerical efficiency of matrix-vector products and solves. We
743    search for consecutive rows with the same nonzero structure, thereby
744    reusing matrix information to achieve increased efficiency.
745 
746    Level: intermediate
747 
748 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
749 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create an empty m-by-n sequential matrix, switch it to the CUSPARSE type,
     then preallocate storage (per the man page above, nnz overrides nz). */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* The SeqAIJ preallocation routine is called directly (not through the
     composed-function dispatch); the cast drops const to match its
     PetscInt* signature. */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
761 
762 #undef __FUNCT__
763 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Tears down the GPU-side state hung off A->spptr, then delegates the
   host-side cleanup to MatDestroy_SeqAIJ. The type of the struct at
   A->spptr depends on A->factortype (see MatCreate_SeqAIJCUSPARSE). */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* Regular (unfactored) matrix: free the GPU matrix and its temp vector. */
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors: spptr actually points at the TriFactors
       struct, so re-cast before deleting both factors and the temp vector.
       (delete of a null pointer is a no-op, so no null checks are needed.) */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /* NOTE(review): MAT_cusparseHandle is a single handle shared by all
     CUSPARSE matrices (see MatCreate_SeqAIJCUSPARSE), yet it is destroyed
     here when ANY one matrix is destroyed — this looks unsafe if other
     CUSPARSE matrices are still alive; confirm intended lifetime. */
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
807 
808 EXTERN_C_BEGIN
809 #undef __FUNCT__
810 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
811 PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
812 {
813   PetscErrorCode ierr;
814 
815   PetscFunctionBegin;
816   ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
817   if (B->factortype==MAT_FACTOR_NONE) {
818     /* you cannot check the inode.use flag here since the matrix was just created.
819        now build a GPU matrix data structure */
820     B->spptr        = new Mat_SeqAIJCUSPARSE;
821     ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
822     ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
823     ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = MAT_CUSPARSE_CSR;
824   } else {
825     /* NEXT, set the pointers to the triangular factors */
826     B->spptr        = new Mat_SeqAIJCUSPARSETriFactors;
827     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
828     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
829     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
830     ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = cusparseMatSolveStorageFormat;
831   }
832   /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
833   if (!MAT_cusparseHandle) {
834     cusparseStatus_t stat;
835     stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
836   }
837   /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
838      default cusparse tri solve. Note the difference with the implementation in
839      MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
840   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
841   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
842   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
843   B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
844   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
845   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
846   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
847   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
848   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
849   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
850   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
851   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
852   PetscFunctionReturn(0);
853 }
854 EXTERN_C_END
855 
856 /*M
857    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
858 
859    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
860    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
861    the CUSPARSE library. This type is only available when using the 'txpetscgpu' package.
862    Use --download-txpetscgpu to build/install PETSc to use a different CUSPARSE library and
863    the different GPU storage formats.
864 
865    Options Database Keys:
866 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
867 .  -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
868 .  -mat_cusparse_mult_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
869 -  -mat_cusparse_solve_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package.
870 
871   Level: beginner
872 
873 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
874 M*/
875