xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ca45077f9abf8e7344aac749d49eef6cfd88f621)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 
16 // this is such a hack ... but I don't know of another way to pass this variable
17 // from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
18 //  SpMV. Essentially, I need to use the same stream variable in two different
19 //  data structures. I do this by creating a single instance of that stream
20 //  and reuse it.
21 cudaStream_t theBodyStream=0;
22 
23 EXTERN_C_BEGIN
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
31 EXTERN_C_END
32 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   MatFactorGetSolverPackage_seqaij_cusparse - Query callback composed on CUSPARSE
   factored matrices: reports that the factorization is provided by the CUSPARSE
   solver package (MATSOLVERCUSPARSE).  The matrix argument A is unused.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
EXTERN_C_END
43 
44 EXTERN_C_BEGIN
45 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
46 EXTERN_C_END
47 
48 
49 
50 /*MC
51   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices
52   on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp
53 
54   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves
55 
56   Options Database Keys:
+ -mat_mult_cusparse_storage_format <CSR>         - (choose one of) CSR, ELL, HYB
- -mat_solve_cusparse_storage_format <CSR>        - (choose one of) CSR, ELL, HYB
59 
60    Level: beginner
61 M*/
62 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   MatGetFactor_seqaij_cusparse - Creates a factor matrix that performs its
   triangular solves with CUSPARSE.  The matrix object is first created by the
   ordinary (CPU) SeqAIJ factory and then retyped to MATSEQAIJCUSPARSE with the
   symbolic-factorization function pointers redirected to the CUSPARSE versions.

   Only LU, ILU and ILUDT factorizations are supported.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* start from the ordinary PETSc (CPU) factor matrix, then retype it below */
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  /* guard clause: only (I)LU-style factorizations have GPU triangular solves */
  if (ftype != MAT_FACTOR_LU && ftype != MAT_FACTOR_ILU && ftype != MAT_FACTOR_ILUDT) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->factortype             = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END
83 
#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   MatAIJCUSPARSESetGPUStorageFormatForMatSolve - Sets the GPU storage format
   (CSR/ELL/HYB) used for the triangular solves of a factored SeqAIJCUSPARSE
   matrix.  If triangular factors were already built on the GPU they are
   destroyed and rebuilt in the new format via MatCUSPARSEAnalysisAndCopyToGPU().
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;

  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo && cusparseMatUp) {
    /* factors already exist in the old format: delete them (the redundant
       per-pointer null checks of the original are unnecessary here) ... */
    delete cusparseMatLo;
    delete cusparseMatUp;
    /* ... and clear the struct pointers so they do not dangle between the
       delete and the rebuild below */
    cusparseTriFactors->loTriFactorPtr = 0;
    cusparseTriFactors->upTriFactorPtr = 0;

    /* key data was deleted so reset the flag */
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    /* rebuild the triangular factors in the new storage format */
    ierr = MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
109 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
   MatSetOption_SeqAIJCUSPARSE - Processes the usual SeqAIJ options and, in
   addition, maps the GPU storage-format options onto the cusparse matrix
   structure: CSR, ELL and HYB are accepted, DIA is rejected with an error,
   and every other option is left to the CPU implementation alone.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool  flg)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* let the CPU AIJ implementation process the option first */
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  if (op == DIAGBLOCK_MAT_CSR || op == OFFDIAGBLOCK_MAT_CSR || op == MAT_CSR) {
    cusparseMat->format = CSR;
  } else if (op == DIAGBLOCK_MAT_DIA || op == OFFDIAGBLOCK_MAT_DIA || op == MAT_DIA) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  } else if (op == DIAGBLOCK_MAT_ELL || op == OFFDIAGBLOCK_MAT_ELL || op == MAT_ELL) {
    cusparseMat->format = ELL;
  } else if (op == DIAGBLOCK_MAT_HYB || op == OFFDIAGBLOCK_MAT_HYB || op == MAT_HYB) {
    cusparseMat->format = HYB;
  }
  /* all other options: nothing GPU-specific to do */
  PetscFunctionReturn(0);
}
//EXTERN_C_END
148 
//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Reads the GPU storage-format options.
   For an unfactored matrix, -mat_mult_cusparse_storage_format selects the
   SpMV storage format; for a factor matrix, -mat_solve_cusparse_storage_format
   selects the triangular-solve storage format.  Valid values: CSR, ELL, HYB
   (default CSR).
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       idx=0;
  char           *formats[] ={CSR,ELL,HYB};
  MatOption      format=MAT_CSR;   /* bug fix: was uninitialized when no branch matched */
  PetscBool      flg;

  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    /* bug fix: select by list index rather than comparing string pointers --
       "formats[idx] == CSR" compared the addresses of string literals, which
       need not be shared between the array initializer and the use site */
    switch (idx) {
    case 0: format = MAT_CSR; break;
    case 1: format = MAT_ELL; break;
    case 2: format = MAT_HYB; break;
    }
    ierr = MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  } else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr = MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
//EXTERN_C_END
185 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization.  The
   symbolic phase is performed entirely by the CPU SeqAIJ implementation; this
   wrapper only redirects the numeric phase to the CUSPARSE version so the
   triangular factors subsequently end up on the GPU.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* the numeric factorization builds the GPU triangular factors afterwards */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
197 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization.  The
   symbolic phase is performed entirely by the CPU SeqAIJ implementation; this
   wrapper only redirects the numeric phase to the CUSPARSE version so the
   triangular factors subsequently end up on the GPU.
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* the numeric factorization builds the GPU triangular factors afterwards */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
209 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   MatCUSPARSEBuildLowerTriMatrix - Assembles the lower triangular factor L of a
   CPU-factored SeqAIJ matrix into an explicit CSR structure in pinned host
   memory, hands it to the txpetscgpu GPU matrix class, and runs the cusparse
   triangular-solve analysis on it.

   The SeqAIJ factored storage keeps the strictly lower triangular entries of
   row i in a->j/a->a between ai[i] and ai[i+1]; the unit diagonal is implicit.
   An explicit unit diagonal entry is appended to each row here, so
   nzLower = (ai[n]-ai[1]) strictly-lower entries + n diagonal ones.

   Only runs when the GPU copy is absent or stale (PETSC_CUSP_UNALLOCATED/CPU).

   NOTE(review): nzLower = n + ai[n] - ai[1] reads ai[1] and row 0 is
   special-cased, so this assumes n >= 1 -- confirm callers never pass an
   empty matrix.
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;   /* CSR row pointers / column indices of L */
  PetscScalar *AALo;       /* CSR values of L */
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* allocate pinned (page-locked) host buffers so the upload to the GPU is fast */
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* fill the lower triangular matrix: row 0 holds only the unit diagonal */
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;      /* next free slot in AjLo/AALo */
      rowOffset=1;   /* running CSR row pointer */
      for (i=1; i<n; i++) {
        nz  = ai[i+1] - ai[i];
        /* additional 1 for the (implicit) unit term appended on the diagonal */
        AiLo[i]=rowOffset;
        rowOffset+=nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the explicit unit diagonal entry */
        offset+=nz;
        AjLo[offset]=(PetscInt) i;
        AALo[offset]=(MatScalar) 1.0;
        offset+=1;

        v  += nz;
        vi += nz;
      }
      /* create the GPU matrix in the requested storage format and analyze it for triangular solves */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      /* free the temporary pinned buffers (NOTE(review): these leak if an
         earlier CHKERRCUSP above returned -- error paths only) */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
278 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
   MatCUSPARSEBuildUpperTriMatrix - Assembles the upper triangular factor U of a
   CPU-factored SeqAIJ matrix into an explicit CSR structure in pinned host
   memory, hands it to the txpetscgpu GPU matrix class, and runs the cusparse
   triangular-solve analysis on it.

   The SeqAIJ factored storage addresses U backwards through a->diag: the
   entries of row i live in a->j/a->a between adiag[i+1]+1 and adiag[i], with
   the diagonal value stored at adiag[i].  The rows are therefore walked from
   last to first, filling the CSR arrays back to front.

   NOTE(review): the diagonal is written as 1./v[nz], i.e. the stored pivot is
   inverted -- presumably SeqAIJ keeps reciprocal pivots at adiag[i]; confirm
   against MatLUFactorNumeric_SeqAIJ.
*/
PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;   /* CSR row pointers / column indices of U */
  PetscScalar *AAUp;       /* CSR values of U */
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* allocate pinned (page-locked) host buffers so the upload to the GPU is fast */
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;   /* write position, moving backwards through AjUp/AAUp */
      for (i=n-1; i>=0; i--){
        v   = aa + adiag[i+1] + 1;
        vi  = aj + adiag[i+1] + 1;

        // number of elements NOT on the diagonal
        nz = adiag[i] - adiag[i+1]-1;

        // decrement the offset
        offset -= (nz+1);

        // first, set the diagonal element of row i
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of the row */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* create the GPU matrix in the requested storage format and analyze it for triangular solves */
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      /* free the temporary pinned buffers (NOTE(review): these leak if an
         earlier CHKERRCUSP above returned -- error paths only) */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
344 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"   /* bug fix: was misspelled "MatCUSPARSEAnalysiAndCopyToGPU" */
/*
   MatCUSPARSEAnalysisAndCopyToGPU - Builds both triangular factors of a
   factored matrix on the GPU, allocates the intermediate work vector used by
   MatSolve, and records any row/column permutations on the factors so the
   solves can reorder in/out.
*/
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS             isrow = a->row,iscol = a->icol;
  PetscBool      row_identity,col_identity;
  const PetscInt *r,*c;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  /* bug fix: do not leak a previously allocated work vector when the factors
     are rebuilt (e.g. after a storage-format change via
     MatAIJCUSPARSESetGPUStorageFormatForMatSolve) */
  if (cusparseTriFactors->tempvec) delete cusparseTriFactors->tempvec;
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* the lower triangular factor is permuted by the row ordering */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* the upper triangular factor is permuted by the column ordering */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
379 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU/ILU factorization.  The
   factorization itself runs on the CPU (MatLUFactorNumeric_SeqAIJ); the
   triangular factors are then analyzed and copied to the GPU, and the
   MatSolve implementation is selected depending on whether row/column
   permutations are present.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering allows the cheaper MatSolve without the reordering steps */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  B->ops->solve = (row_identity && col_identity) ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                                 : MatSolve_SeqAIJCUSPARSE;

  /* build the triangular factors on the GPU */
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
403 
404 
405 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   MatSolve_SeqAIJCUSPARSE - Solves A x = b on the GPU via the triangular
   factors (forward solve with L, back solve with U), applying the stored
   row/column permutations through reorderIn/reorderOut.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *workArray = (CUSPARRAY*)cusparseTriFactors->tempvec;
  CUSPARRAY                    *xarray,*barray;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* device-side views of the vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* permute rhs into x, forward solve, back solve, permute solution back */
  ierr = loFactor->reorderIn(xarray, barray);CHKERRCUSP(ierr);
  stat = loFactor->solve(xarray, workArray);CHKERRCUSP(stat);
  stat = upFactor->solve(workArray, xarray);CHKERRCUSP(stat);
  ierr = upFactor->reorderOut(xarray);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
436 
437 
438 
439 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Solves A x = b on the GPU for the
   case where both row and column orderings are the identity, so no
   reorder-in/reorder-out steps are needed.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *loFactor  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *upFactor  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *workArray = (CUSPARRAY*)cusparseTriFactors->tempvec;
  CUSPARRAY                    *xarray,*barray;
  cusparseStatus_t             stat;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* device-side views of the vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward solve with L, then back solve with U */
  stat = loFactor->solve(barray, workArray);CHKERRCUSP(stat);
  stat = upFactor->solve(workArray, xarray);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
468 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   MatCUSPARSECopyToGPU - Copies a SeqAIJ matrix to the GPU (for SpMV) when the
   GPU copy is absent or stale.  Any previous GPU matrix and its work vector
   are destroyed and rebuilt from scratch.  Rows containing no nonzeros are
   squeezed out ("compressed row") even when the CPU matrix does not use
   compressed-row storage; ridx maps compressed row numbers back to original
   row numbers.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;

      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      /* count rows that actually contain nonzeros (used for flop logging and row compression) */
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* the CPU matrix already stores compressed rows: reuse its index arrays */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        // Forcing compressed row on the GPU ... only relevant for CSR storage
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k] = a->i[j];   /* row pointer of the k-th nonempty row */
            ridx[k]= j;        /* its original (uncompressed) row number */
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired); theBodyStream is shared
      // with the parallel SpMV (see comment at the top of this file).
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant here: the matrix type is GENERAL, not triangular
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        /* free the temporary compression arrays built above
           (NOTE(review): presumably setMatrix/setCPRowIndices copy rather than
           alias them -- confirm in the txpetscgpu classes) */
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector sized to the (compressed) row count */
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
556 
557 // #undef __FUNCT__
558 // #define __FUNCT__ "MatCUSPARSECopyFromGPU"
559 // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
560 // {
561 //   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
562 //   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
563 //   PetscInt        m          = A->rmap->n;
564 //   PetscErrorCode  ierr;
565 
566 //   PetscFunctionBegin;
567 //   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
568 //     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
569 //       try {
570 //         cuspstruct->mat = Agpu;
571 //         if (a->compressedrow.use) {
572 //           //PetscInt *ii, *ridx;
573 //           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
574 //         } else {
575 //           PetscInt i;
576 
577 //           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
578 //           a->nz    = cuspstruct->mat->values.size();
579 //           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
580 //           A->preallocated = PETSC_TRUE;
581 //           // Copy ai, aj, aa
582 //           if (a->singlemalloc) {
583 //             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
584 //           } else {
585 //             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
586 //             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
587 //             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
588 //           }
589 //           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
590 //           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
591 //           a->singlemalloc = PETSC_TRUE;
592 //           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
593 //           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
594 //           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
595 //           // Setup row lengths
596 //           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
597 //           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
598 //           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
599 //           for(i = 0; i < m; ++i) {
600 //             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
601 //           }
602 //           // a->diag?
603 //         }
604 //         cuspstruct->tempvec = new CUSPARRAY;
605 //         cuspstruct->tempvec->resize(m);
606 //       } catch(char *ex) {
607 //         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
608 //       }
609 //     }
610 //     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
611 //     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
612 //     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
613 //     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
614 //   } else {
615 //     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
616 //   }
617 //   PetscFunctionReturn(0);
618 // }
619 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   MatGetVecs_SeqAIJCUSPARSE - Creates CUSP vectors conforming to the matrix
   layout: a "right" vector lives in the column space (length cmap->n) and a
   "left" vector in the row space (length rmap->n).  Either output pointer may
   be PETSC_NULL to skip that vector.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* bug fix: the right vector conforms to the COLUMN layout, so its block
       size must come from cmap, not rmap (they differ for rectangular or
       differently-blocked matrices; identical for the square case) */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
644 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   MatMult_SeqAIJCUSPARSE - Computes yy = A*xx on the GPU.  Assumes the matrix
   has already been copied to the GPU (done in MatAssemblyEnd_SeqAIJCUSPARSE).
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* when the SpMV runs on a non-default stream, synchronization is deferred */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
672 
673 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   MatMultTranspose_SeqAIJCUSPARSE - Computes yy = A^T*xx (A^H*xx for complex
   scalars) on the GPU.  Assumes the matrix has already been copied to the GPU
   (done in MatAssemblyEnd_SeqAIJCUSPARSE).
*/
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
#if defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xGPU, yGPU, HERMITIAN);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiply(xGPU, yGPU, TRANSPOSE);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* when the SpMV runs on a non-default stream, synchronization is deferred */
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
705 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   MatMultAdd_SeqAIJCUSPARSE - Computes zz = yy + A*xx on the GPU: yy is first
   copied into zz and the product is then accumulated into zz in place.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU,*zGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  try {
    /* zz <- yy, then zz += A*xx */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zGPU);CHKERRQ(ierr);

    ierr = cusparseMat->mat->multiplyAdd(xGPU, zGPU);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yGPU);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zGPU);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
737 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"   /* bug fix: was wrongly "MatMultAdd_SeqAIJCUSPARSE" */
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - Computes zz = yy + A^T*xx (A^H*xx for
   complex scalars) on the GPU: yy is first copied into zz and the transpose
   product is then accumulated into zz in place.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  try {
    /* zz <- yy, then zz += A^T*xx */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* bug fix: accumulate into zz (which already holds the copy of yy), not
       into yy -- the original passed yarray here, leaving zz == yy.  Compare
       MatMultAdd_SeqAIJCUSPARSE, which correctly uses zarray. */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
773 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Finishes assembly with the base SeqAIJ implementation and then mirrors the
   assembled data to the GPU so later device-side operations see current values.
*/
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* CPU-side assembly first ... */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* ... then refresh the device copy.  MatCUSPARSECopyToGPU also maintains
     A->valid_GPU_matrix, so no additional bookkeeping is required here for
     either MAT_FLUSH_ASSEMBLY or MAT_FINAL_ASSEMBLY. */
  ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
789 
790 /* --------------------------------------------------------------------------------*/
791 #undef __FUNCT__
792 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
793 /*@C
   MatCreateSeqAIJCUSPARSE - Creates a sequential sparse matrix in AIJ (compressed row)
   format with matrix operations executed on the GPU via CUSPARSE.  For good matrix assembly performance
796    the user should preallocate the matrix storage by setting the parameter nz
797    (or the array nnz).  By setting these parameters accurately, performance
798    during matrix assembly can be increased by more than a factor of 50.
799 
800    Collective on MPI_Comm
801 
802    Input Parameters:
803 +  comm - MPI communicator, set to PETSC_COMM_SELF
804 .  m - number of rows
805 .  n - number of columns
806 .  nz - number of nonzeros per row (same for all rows)
807 -  nnz - array containing the number of nonzeros in the various rows
808          (possibly different for each row) or PETSC_NULL
809 
810    Output Parameter:
811 .  A - the matrix
812 
813    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
815    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
816 
817    Notes:
818    If nnz is given then nz is ignored
819 
820    The AIJ format (also called the Yale sparse matrix format or
821    compressed row storage), is fully compatible with standard Fortran 77
822    storage.  That is, the stored row and column indices can begin at
823    either one (as in Fortran) or zero.  See the users' manual for details.
824 
825    Specify the preallocated storage with either nz or nnz (not both).
826    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
827    allocation.  For large problems you MUST preallocate memory or you
828    will get TERRIBLE performance, see the users' manual chapter on matrices.
829 
830    By default, this format uses inodes (identical nodes) when possible, to
831    improve numerical efficiency of matrix-vector products and solves. We
832    search for consecutive rows with the same nonzero structure, thereby
833    reusing matrix information to achieve increased efficiency.
834 
835    Level: intermediate
836 
837 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
838 
839 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* convenience wrapper: create, size, type, and preallocate in one call */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the const cast is safe: the preallocation routine does not modify nnz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
851 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Frees the GPU-side data hanging off A->spptr, then defers to the base
   SeqAIJ destroy for the CPU-side structures.  Unfactored matrices carry a
   Mat_SeqAIJCUSPARSE in spptr; factored matrices carry a
   Mat_SeqAIJCUSPARSETriFactors holding the two triangular factors.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      // the GPU matrix only exists once something has been copied to the device
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  // NOTE(review): MAT_cusparseHandle is a single handle shared by all
  // aijcusparse matrices (created in MatCreate_SeqAIJCUSPARSE), yet it is
  // destroyed here whenever ANY one matrix is destroyed; other live matrices
  // would then call cuSPARSE with a destroyed handle.  Confirm the intended
  // lifetime -- reference counting may be needed.
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
899 
900 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor for MATSEQAIJCUSPARSE: builds on MatCreate_SeqAIJ, attaches
   the GPU-side container in B->spptr, lazily creates the shared cuSPARSE
   handle, and installs the CUSPARSE operation table.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  GPUStorageFormat format = CSR;

  PetscFunctionBegin;
  ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.*/
    // now build a GPU matrix data structure
    B->spptr        = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = format;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    // Allocate the container before writing through it: MatCreate_SeqAIJ does
    // not set spptr, so the previous code dereferenced an unset pointer here.
    B->spptr        = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = format;
  }
  // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...)
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
  // default cusparse tri solve. Note the difference with the implementation in
  // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  // nothing has been copied to the device yet
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
EXTERN_C_END
948 
949