xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision bbf3fe20a13d25761406078d7bb366504465e38d)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 
// HACK: a single, globally shared CUDA stream reused across GPU_Matrix_Ifc
// instances.  The parallel SpMV needs the same stream variable in two
// different data structures, and there is currently no clean way to pass it
// between them, so one stream is created here and reused everywhere.
21 cudaStream_t theBodyStream=0;
22 
23 EXTERN_C_BEGIN
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
31 EXTERN_C_END
32 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query callback composed on factored matrices: reports MATSOLVERCUSPARSE as
   the solver package that produced the factors.  Registered on the factor
   matrix under "MatFactorGetSolverPackage_C" by MatGetFactor_seqaij_cusparse.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END
43 
44 EXTERN_C_BEGIN
45 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
46 EXTERN_C_END
47 
48 
49 
50 /*MC
51   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
52   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
53 
54   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves
55 
56   Options Database Keys:
+ -mat_mult_cusparse_storage_format <CSR>         - (choose one of) CSR, ELL, HYB
- -mat_solve_cusparse_storage_format <CSR>        - (choose one of) CSR, ELL, HYB
59 
60    Level: beginner
61 M*/
62 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine for MATSOLVERCUSPARSE.  Builds the factor matrix with the
   stock AIJ factory, then retargets it so the symbolic factorization (and,
   downstream, the numeric factorization and triangular solves) take the
   CUSPARSE code paths.  Only LU-family factorizations are supported.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  /* guard clause: anything outside the LU family is unsupported */
  if (ftype != MAT_FACTOR_LU && ftype != MAT_FACTOR_ILU && ftype != MAT_FACTOR_ILUDT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->factortype             = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
83 
#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   Sets the GPU storage format used for the triangular factors of a factored
   (seq)aijcusparse matrix.  If factors already exist on the GPU they are
   destroyed and rebuilt in the new format.

   Fixes relative to the previous revision: the inner per-pointer null checks
   were dead code (both pointers are already known non-null inside the outer
   test), and the freed factor pointers were left dangling in the TriFactors
   structure until the rebuild overwrote them; they are now cleared first.
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo && cusparseMatUp) {
    // Factors exist in the old format: delete them and clear the (otherwise
    // dangling) pointers so nothing can reference the freed objects.
    delete cusparseMatLo;
    delete cusparseMatUp;
    cusparseTriFactors->loTriFactorPtr = 0;
    cusparseTriFactors->upTriFactorPtr = 0;

    // key data was deleted so reset the flag; the rebuild below checks it.
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    // rebuild the factors on the GPU in the new storage format
    ierr = MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
109 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
   MatSetOption handler for (seq)aijcusparse.  Options are first forwarded to
   the base AIJ implementation; the GPU storage-format options are then
   translated into the format field of the GPU matrix structure.  DIA storage
   is rejected; all other options are fully handled by MatSetOption_SeqAIJ.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool  flg)
{
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  if (op == MAT_DIAGBLOCK_CSR || op == MAT_OFFDIAGBLOCK_CSR || op == MAT_CSR) {
    cusparseMat->format = CSR;
  } else if (op == MAT_DIAGBLOCK_DIA || op == MAT_OFFDIAGBLOCK_DIA || op == MAT_DIA) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  } else if (op == MAT_DIAGBLOCK_ELL || op == MAT_OFFDIAGBLOCK_ELL || op == MAT_ELL) {
    cusparseMat->format = ELL;
  } else if (op == MAT_DIAGBLOCK_HYB || op == MAT_OFFDIAGBLOCK_HYB || op == MAT_HYB) {
    cusparseMat->format = HYB;
  }
  PetscFunctionReturn(0);
}
//EXTERN_C_END
145 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes -mat_mult_cusparse_storage_format (unfactored matrices) and
   -mat_solve_cusparse_storage_format (factored matrices).

   Fix relative to the previous revision: the selected list index is mapped to
   the MatOption directly.  The old code compared the returned string against
   the CSR/ELL/HYB constants with ==, i.e. by pointer identity, which is not
   guaranteed for string literals and left 'format' uninitialized whenever no
   comparison matched.
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       idx=0;
  char           *formats[] = {CSR,ELL,HYB};
  MatOption      format = MAT_CSR;   /* matches formats[0], the default */
  PetscBool      flg;
  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    /* idx indexes formats[]: 0 -> CSR, 1 -> ELL, 2 -> HYB */
    if (idx == 1)      format = MAT_ELL;
    else if (idx == 2) format = MAT_HYB;
    ierr = MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  } else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr = MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
//EXTERN_C_END
182 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: reuses the CPU AIJ symbolic phase unchanged,
   then redirects the numeric phase to the CUSPARSE implementation, which
   uploads the triangular factors to the GPU after factoring.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
194 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: reuses the CPU AIJ symbolic phase unchanged,
   then redirects the numeric phase to the CUSPARSE implementation, which
   uploads the triangular factors to the GPU after factoring.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
206 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   Builds the GPU representation of the lower triangular factor L.

   The host AIJ factor stores the strictly lower part of L row by row with an
   implicit unit diagonal, so an explicit CSR copy is assembled in pinned host
   memory with a 1.0 appended on the diagonal of every row, uploaded through
   the txpetscgpu matrix class, and prepared for triangular solves by the
   cusparse analysis phase.  A no-op when the GPU copy is already current.
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;       /* CSR row pointers / column indices for L */
  PetscScalar *AALo;           /* CSR values for L */
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      // ai[n]-ai[1] counts the strictly-lower entries of rows 1..n-1 (row 0 is
      // handled explicitly below); the +n adds the unit diagonal entries
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      // pinned (page-locked) host memory for a fast upload to the GPU
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      // row 0 contains only the unit diagonal entry
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;          /* next free slot in AjLo/AALo */
      rowOffset=1;       /* running CSR row pointer */
      for (i=1; i<n; i++) {
        nz  = ai[i+1] - ai[i];
        // additional 1 for the term on the diagonal
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        // copy the strictly-lower entries of row i ...
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset+=nz;
        // ... then append the explicit unit diagonal entry
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      // create the GPU matrix in the requested storage format, upload the CSR
      // data, and run the cusparse triangular-solve analysis phase
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
275 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
   Builds the GPU representation of the upper triangular factor U.

   In the factored AIJ layout a->diag marks the diagonal of each row of U and
   the rows of U are stored in reverse order, so the rows are walked from the
   bottom up while the CSR arrays are filled back to front.  A no-op when the
   GPU copy is already current.
*/
PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;       /* CSR row pointers / column indices for U */
  PetscScalar *AAUp;           /* CSR values for U */
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      // U occupies the range adiag[n]..adiag[0] (rows stored in reverse)
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      // pinned (page-locked) host memory for a fast upload to the GPU
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix: rows visited bottom-up, CSR arrays
         filled back to front. */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--){
        // off-diagonal entries of row i start right after row i+1's marker
        v   = aa + adiag[i+1] + 1;
        vi  = aj + adiag[i+1] + 1;

        // number of elements NOT on the diagonal
        nz = adiag[i] - adiag[i+1]-1;

        // decrement the offset
        offset -= (nz+1);

        // first, set the diagonal elements
        // (the host factor appears to store the reciprocal of the diagonal,
        //  so 1./v[nz] recovers u_ii -- TODO(review): confirm against
        //  MatLUFactorNumeric_SeqAIJ)
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      // create the GPU matrix in the requested storage format, upload the CSR
      // data, and run the cusparse triangular-solve analysis phase
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
341 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"
/*
   Uploads both triangular factors to the GPU, allocates the intermediate work
   vector used by the two-stage triangular solve, and, for non-identity
   orderings, uploads the row/column permutation indices.

   Fixes relative to the previous revision:
   - __FUNCT__ was misspelled ("MatCUSPARSEAnalysiAndCopyToGPU"), garbling
     error tracebacks;
   - tempvec was allocated unconditionally, leaking the previous work vector
     whenever the factors are rebuilt (repeated numeric factorizations or a
     storage-format change);
   - the CHKERRCUSP checks after setOrdIndices sat outside the un-braced if
     bodies, so they ran unconditionally against a stale ierr.
*/
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  // (re)allocate the temporary vector shared by the L- and U-solves
  if (cusparseTriFactors->tempvec) delete cusparseTriFactors->tempvec;
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  //lower triangular indices
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  //upper triangular indices
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
376 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU/ILU factorization: performs the factorization on the CPU with
   the stock AIJ kernel, selects the triangular-solve routine appropriate for
   the ordering (the natural-ordering variant skips the permutation steps),
   and then copies the factors to the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *bAij = (Mat_SeqAIJ *)B->data;
  IS             rowPerm = bAij->row,colPerm = bAij->col;
  PetscBool      rowIsNatural,colIsNatural;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the MatSolve variant: with identity permutations the
     reorder-in/reorder-out steps can be skipped entirely */
  ierr = ISIdentity(rowPerm,&rowIsNatural);CHKERRQ(ierr);
  ierr = ISIdentity(colPerm,&colIsNatural);CHKERRQ(ierr);
  B->ops->solve = (rowIsNatural && colIsNatural) ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                                 : MatSolve_SeqAIJCUSPARSE;

  /* build the triangular factors on the GPU */
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
400 
401 
402 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve xx = U^{-1} L^{-1} bb for factors built with a
   non-natural ordering: the right-hand side is permuted into the factored
   ordering on the way in (reorderIn) and the solution permuted back on the
   way out (reorderOut).  tempvec carries the intermediate result between the
   L- and U-solves.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve with reordering: permute b into xGPU, L-solve into tempGPU,
  // U-solve back into xGPU, then undo the permutation in place
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  // block until the GPU solves have finished before logging/returning
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
433 
434 
435 
436 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU triangular solve xx = U^{-1} L^{-1} bb for factors built with the
   natural (identity) ordering: identical to MatSolve_SeqAIJCUSPARSE except
   that the permutation steps are skipped, so the L-solve reads bb directly.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode    ierr;
  CUSPARRAY         *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve: L-solve into the work vector, then U-solve into xx
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  // block until the GPU solves have finished before logging/returning
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
465 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   Copies an assembled (unfactored) AIJ matrix to the GPU, rebuilding the
   txpetscgpu matrix object from scratch.  A no-op when the GPU copy is
   already current.  Rows with no nonzeros are squeezed out so the GPU always
   receives a compressed-row structure, even when the host matrix does not
   use one.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      // count rows that actually contain nonzeros: used for flop logging and
      // for sizing the forced compressed-row structure below
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        // host matrix already compressed: reuse its arrays directly
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        // Forcing compressed row on the GPU ... only relevant for CSR storage
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        // keep only the nonempty rows; ridx maps compressed row -> original row
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k] = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired).
      // theBodyStream is the file-global stream shared for the parallel SpMV.
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant for a general (non-triangular) matrix
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        // free only the locally built arrays; the compressedrow arrays are
        // owned by the host matrix.
        // NOTE(review): this assumes setCPRowIndices copied ridx -- confirm
        // against the txpetscgpu implementation.
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
553 
554 // #undef __FUNCT__
555 // #define __FUNCT__ "MatCUSPARSECopyFromGPU"
556 // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
557 // {
558 //   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
559 //   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
560 //   PetscInt        m          = A->rmap->n;
561 //   PetscErrorCode  ierr;
562 
563 //   PetscFunctionBegin;
564 //   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
565 //     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
566 //       try {
567 //         cuspstruct->mat = Agpu;
568 //         if (a->compressedrow.use) {
569 //           //PetscInt *ii, *ridx;
570 //           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
571 //         } else {
572 //           PetscInt i;
573 
574 //           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
575 //           a->nz    = cuspstruct->mat->values.size();
576 //           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
577 //           A->preallocated = PETSC_TRUE;
578 //           // Copy ai, aj, aa
579 //           if (a->singlemalloc) {
580 //             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
581 //           } else {
582 //             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
583 //             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
584 //             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
585 //           }
586 //           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
587 //           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
588 //           a->singlemalloc = PETSC_TRUE;
589 //           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
590 //           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
591 //           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
592 //           // Setup row lengths
593 //           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
594 //           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
595 //           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
596 //           for(i = 0; i < m; ++i) {
597 //             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
598 //           }
599 //           // a->diag?
600 //         }
601 //         cuspstruct->tempvec = new CUSPARRAY;
602 //         cuspstruct->tempvec->resize(m);
603 //       } catch(char *ex) {
604 //         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
605 //       }
606 //     }
607 //     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
608 //     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
609 //     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
610 //     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
611 //   } else {
612 //     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
613 //   }
614 //   PetscFunctionReturn(0);
615 // }
616 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates GPU (VECSEQCUSP) vectors compatible with this matrix.  The right
   vector lives in the matrix column space and the left vector in the row
   space, so the right vector now takes its block size from cmap (the previous
   revision used rmap->bs for both, which is wrong when the row and column
   block sizes differ).
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
641 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Sparse matrix-vector product yy = A*xx, computed on the GPU by the
   txpetscgpu matrix object.  The matrix is assumed to be current on the GPU;
   MatAssemblyEnd_SeqAIJCUSPARSE performs the upload.
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = gpu->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  // skip the blocking synchronization when the object reports work queued on
  // a separate (non-zero) stream
  if (!gpu->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*aij->nz - gpu->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
669 
670 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   yy = A^T*xx (A^H*xx for complex scalars), computed on the GPU by the
   txpetscgpu matrix object.  The matrix is assumed to be current on the GPU;
   MatAssemblyEnd_SeqAIJCUSPARSE performs the upload.
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    // complex builds use the conjugate (Hermitian) transpose
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  // skip the blocking synchronization when the object reports work queued on
  // a separate (non-zero) stream
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
702 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A*xx, computed on the GPU: yy is first copied into zz and the
   product is then accumulated into zz in place by multiplyAdd.  The matrix is
   assumed to be current on the GPU (uploaded in MatAssemblyEnd_SeqAIJCUSPARSE).
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  try {
    /* seed zz with yy; the product is accumulated on top of it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    // NOTE(review): yarray is acquired/restored but never used below -- the
    // VecCopy above already transferred yy's data into zz.  Confirm whether
    // the Get/Restore pair on yy is needed for coherency side effects before
    // removing it.
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add: zz += A*xx
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  // block until the GPU work has finished before logging/returning
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
734 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A^T*xx (A^H*xx for complex scalars), computed on the GPU: yy is
   first copied into zz and the transpose product is accumulated into zz.

   Fixes relative to the previous revision:
   - __FUNCT__ named the wrong routine (MatMultAdd_SeqAIJCUSPARSE), garbling
     error tracebacks;
   - multiplyAdd accumulated into yarray (the input vector yy) instead of
     zarray, so the result never reached zz; the non-transpose MatMultAdd
     correctly accumulates into zarray.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  try {
    /* seed zz with yy; the transpose product is accumulated on top of it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add with matrix transpose: zz += A^T*xx (A^H*xx for complex)
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  // block until the GPU work has finished before logging/returning
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
770 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Complete assembly on the host, mirror the matrix onto the GPU, and (for a
   final assembly only) install the GPU mat-vec kernels as the active ops. */
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Host-side (SeqAIJ) assembly must finish before the GPU copy. */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* Final assembly: route multiplications through the CUSPARSE implementations. */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
790 
791 /* --------------------------------------------------------------------------------*/
792 #undef __FUNCT__
793 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
794 /*@C
795    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
796    (the default parallel PETSc format).  For good matrix assembly performance
797    the user should preallocate the matrix storage by setting the parameter nz
798    (or the array nnz).  By setting these parameters accurately, performance
799    during matrix assembly can be increased by more than a factor of 50.
800 
801    Collective on MPI_Comm
802 
803    Input Parameters:
804 +  comm - MPI communicator, set to PETSC_COMM_SELF
805 .  m - number of rows
806 .  n - number of columns
807 .  nz - number of nonzeros per row (same for all rows)
808 -  nnz - array containing the number of nonzeros in the various rows
809          (possibly different for each row) or PETSC_NULL
810 
811    Output Parameter:
812 .  A - the matrix
813 
814    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
815    MatXXXXSetPreallocation() paradigm instead of this routine directly.
816    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
817 
818    Notes:
819    If nnz is given then nz is ignored
820 
821    The AIJ format (also called the Yale sparse matrix format or
822    compressed row storage), is fully compatible with standard Fortran 77
823    storage.  That is, the stored row and column indices can begin at
824    either one (as in Fortran) or zero.  See the users' manual for details.
825 
826    Specify the preallocated storage with either nz or nnz (not both).
827    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
828    allocation.  For large problems you MUST preallocate memory or you
829    will get TERRIBLE performance, see the users' manual chapter on matrices.
830 
831    By default, this format uses inodes (identical nodes) when possible, to
832    improve numerical efficiency of matrix-vector products and solves. We
833    search for consecutive rows with the same nonzero structure, thereby
834    reusing matrix information to achieve increased efficiency.
835 
836    Level: intermediate
837 
838 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
839 
840 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Standard create/size/type sequence; MatSetType() runs MatCreate_SeqAIJCUSPARSE,
     which installs the GPU ops and the cusparse handle. */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* Preallocate via the SeqAIJ routine directly; the const is cast away because the
     callee takes a non-const pointer — presumably it does not modify nnz (TODO confirm). */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
852 
853 #undef __FUNCT__
854 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Release the GPU-side data for either a regular matrix (the SpMV structure) or a
   factored matrix (the lower/upper triangular factors), tear down the module-wide
   cusparse handle, then delegate to the host-side SeqAIJ destroy. */
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      // cusparseMat->mat only exists after a copy to the GPU; when the matrix was
      // never copied (valid_GPU_matrix still UNALLOCATED) there is nothing to delete.
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      // (deleting a null pointer is a no-op, so unset factors are safe here)
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  // NOTE(review): MAT_cusparseHandle is shared by every aijcusparse matrix (see
  // MatCreate_SeqAIJCUSPARSE), yet it is destroyed as soon as the FIRST matrix is
  // destroyed; any surviving matrices would then use a dead handle — confirm the
  // intended handle lifetime.
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
900 
901 
902 EXTERN_C_BEGIN
903 #undef __FUNCT__
904 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor for MATSEQAIJCUSPARSE: builds the host SeqAIJ matrix, allocates
   the GPU-side container (regular matrix or triangular-factor holder), lazily creates
   the shared cusparse handle, and installs the CUSPARSE method table. */
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  GPUStorageFormat format = CSR;

  PetscFunctionBegin;
  ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.*/
    // now build a GPU matrix data structure
    B->spptr        = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = format;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    // Bug fix: the tri-factors container was never allocated before its members
    // were assigned, so the writes below went through an unset B->spptr
    // (MatCreate_SeqAIJ does not allocate spptr). Allocate it first, mirroring
    // the MAT_FACTOR_NONE branch above.
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = format;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve. Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  // Nothing has been copied to the GPU yet; MatAssemblyEnd/MatCUSPARSECopyToGPU will do it.
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
948 EXTERN_C_END
949 
950