xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 8dc1d2a3a0b0219f67591c34b0f761797c23be35)
1 /*
2     Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format.
4 */
5 
6 #include "petscconf.h"
7 PETSC_CUDA_EXTERN_C_BEGIN
8 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
9 //#include "petscbt.h"
10 #include "../src/vec/vec/impls/dvecimpl.h"
11 #include "petsc-private/vecimpl.h"
12 PETSC_CUDA_EXTERN_C_END
13 #undef VecType
14 #include "cusparsematimpl.h"
15 
16 // this is such a hack ... but I don't know of another way to pass this variable
17 // from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
18 //  SpMV. Essentially, I need to use the same stream variable in two different
19 //  data structures. I do this by creating a single instance of that stream
20 //  and reuse it.
21 cudaStream_t theBodyStream=0;
22 
23 EXTERN_C_BEGIN
24 PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
26 PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *);
27 PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
30 PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat);
31 PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
34 PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
35 EXTERN_C_END
36 
37 EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Reports MATSOLVERCUSPARSE as the solver package of a factor matrix.
   Composed onto factor matrices (see MatGetFactor_seqaij_cusparse) so
   MatFactorGetSolverPackage() can identify which package built them.
   The Mat argument is unused.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
46 EXTERN_C_END
47 
48 EXTERN_C_BEGIN
49 extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
50 EXTERN_C_END
51 
52 
53 
54 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver package providing triangular solvers (LU, ILU) on the GPU
  for sequential matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp
58   ./configure --download-txpetscgpu to install PETSc to use CUSPARSE solves
59 
60   Options Database Keys:
61 + -mat_mult_cusparse_storage_format <CSR>         - (choose one of) CSR, ELL, HYB
- -mat_solve_cusparse_storage_format <CSR>        - (choose one of) CSR, ELL, HYB
63 
64    Level: beginner
65 M*/
66 
67 EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Creates a factor matrix of type MATSEQAIJCUSPARSE for LU, ILU, or ILUDT
   factorization; any other factor type is rejected with PETSC_ERR_SUP.
   The factor starts as the plain PETSc (CPU) factor matrix and is then
   converted and wired up with the CUSPARSE symbolic-factorization routines.
*/
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  /* guard clause: only (I)LU-style factorizations are supported on the GPU */
  if (ftype != MAT_FACTOR_LU && ftype != MAT_FACTOR_ILU && ftype != MAT_FACTOR_ILUDT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  (*B)->factortype             = ftype;
  PetscFunctionReturn(0);
}
86 EXTERN_C_END
87 
#undef __FUNCT__
#define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve"
/*
   Sets the GPU storage format used for the triangular-solve factors.
   If both factors already live on the GPU they are destroyed and rebuilt
   in the new format via MatCUSPARSEAnalysisAndCopyToGPU().
*/
PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc* cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  PetscFunctionBegin;
  cusparseTriFactors->format = format;
  if (cusparseMatLo && cusparseMatUp) {
    // If data exists already, delete it.  The redundant per-pointer null
    // checks of the original were removed (both are known non-null here),
    // and the struct members are reset so no dangling pointers remain
    // between the delete and the rebuild below.
    delete cusparseMatLo;
    delete cusparseMatUp;
    cusparseTriFactors->loTriFactorPtr = 0;
    cusparseTriFactors->upTriFactorPtr = 0;

    // key data was deleted so reset the flag.
    A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

    // rebuild matrix
    ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
113 
114 //EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
/*
   Processes matrix options for SEQAIJCUSPARSE matrices.  Every option is
   first forwarded to MatSetOption_SeqAIJ; the GPU storage-format options
   (CSR/ELL/HYB, including their diag/offdiag-block variants) additionally
   record the requested format in the GPU data structure, while DIA is
   rejected as unsupported.  Unrelated options are otherwise ignored here.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool  flg)
{
  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  /* the CPU AIJ implementation sees every option first */
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  if (op == MAT_DIAGBLOCK_CSR || op == MAT_OFFDIAGBLOCK_CSR || op == MAT_CSR) {
    cusparseMat->format = CSR;
  } else if (op == MAT_DIAGBLOCK_DIA || op == MAT_OFFDIAGBLOCK_DIA || op == MAT_DIA) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  } else if (op == MAT_DIAGBLOCK_ELL || op == MAT_OFFDIAGBLOCK_ELL || op == MAT_ELL) {
    cusparseMat->format = ELL;
  } else if (op == MAT_DIAGBLOCK_HYB || op == MAT_OFFDIAGBLOCK_HYB || op == MAT_HYB) {
    cusparseMat->format = HYB;
  }
  PetscFunctionReturn(0);
}
148 //EXTERN_C_END
149 
150 //EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes runtime options selecting the GPU storage format:
   -mat_mult_cusparse_storage_format  (SpMV, non-factored matrices)
   -mat_solve_cusparse_storage_format (triangular solve, factored matrices)
*/
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  PetscInt       idx=0;
  char * formats[] ={CSR,ELL,HYB};
  MatOption format=MAT_CSR;  /* matches formats[0]; previously left uninitialized when no branch matched */
  PetscBool      flg;
  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
			     "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    /* select by the returned list index rather than comparing string
       pointers (formats[idx] == CSR), which relied on string-literal
       pooling and left `format` uninitialized when no pointer matched */
    if (idx == 1)      format=MAT_ELL;
    else if (idx == 2) format=MAT_HYB;

    ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  }
  else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
			     "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
			     "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
185 //EXTERN_C_END
186 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: defers entirely to the CPU SeqAIJ
   implementation, then installs the CUSPARSE numeric-factorization
   routine on the factor matrix B so the numeric phase builds the
   triangular factors on the GPU.
*/
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
198 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: defers entirely to the CPU SeqAIJ
   implementation, then installs the CUSPARSE numeric-factorization
   routine on the factor matrix B (mirrors the ILU variant above).
*/
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
210 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
/*
   Copies the lower triangular factor L of a completed (I)LU factorization
   from the host CSR arrays (Mat_SeqAIJ a->i/a->j/a->a) to the GPU,
   inserting the implicit unit diagonal entry at the end of each row, and
   runs the cusparse triangular-solve analysis phase on the result.
   Only acts when the GPU copy is missing or stale.
*/
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *ai = a->i,*aj = a->j,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscErrorCode     ierr;
  PetscInt *AiLo, *AjLo;     // pinned host staging buffers: row offsets / column indices of L
  PetscScalar *AALo;         // pinned host staging buffer: values of L
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      // NOTE(review): reads ai[1], so this assumes n >= 1 — confirm callers never hit the empty-matrix case
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      // pinned (page-locked) host memory so the device upload can be fast/async
      ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      // row 0 of L holds only its unit diagonal entry
      AiLo[0]=(PetscInt) 0;
      AiLo[n]=nzLower;
      AjLo[0]=(PetscInt) 0;
      AALo[0]=(MatScalar) 1.0;
      v    = aa;
      vi   = aj;
      offset=1;
      rowOffset=1;
      for (i=1; i<n; i++) {
	nz  = ai[i+1] - ai[i];
	// additional 1 for the term on the diagonal
	AiLo[i]=rowOffset;
	rowOffset+=nz+1;

	// copy the strictly-lower entries of row i, then append the unit diagonal
	ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
	ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

	offset+=nz;
	AjLo[offset]=(PetscInt) i;
	AALo[offset]=(MatScalar) 1.0;
	offset+=1;

	v  += nz;
	vi += nz;
      }
      // build the device matrix in the requested storage format and run the
      // cusparse solve-analysis phase once; results are cached in A->spptr
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
279 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix"
/*
   Copies the upper triangular factor U of a completed (I)LU factorization
   from the host to the GPU and runs the cusparse triangular-solve analysis
   phase on it.  The U part of the factor is stored via a->diag (adiag),
   walked backwards from the last row; the stored diagonal value is
   inverted (1./v[nz]) during the copy.  Only acts when the GPU copy is
   missing or stale.
*/
PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscInt          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc* cusparseMat  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;     // pinned host staging buffers: row offsets / column indices of U
  PetscScalar *AAUp;         // pinned host staging buffer: values of U
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode     ierr;

  PetscFunctionBegin;

  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      // adiag runs backwards for the factored matrix, hence adiag[0]-adiag[n]
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      // pinned (page-locked) host memory so the device upload can be fast/async
      ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      // walk rows from last to first, filling the CSR arrays back-to-front
      for (i=n-1; i>=0; i--){
	v   = aa + adiag[i+1] + 1;
	vi  = aj + adiag[i+1] + 1;

	// number of elements NOT on the diagonal
	nz = adiag[i] - adiag[i+1]-1;

	// decrement the offset
	offset -= (nz+1);

	// first, set the diagonal elements
	// the factor stores the reciprocal pivot, so invert it here
	AjUp[offset] = (PetscInt) i;
	AAUp[offset] = 1./v[nz];
	AiUp[i] = AiUp[i+1] - (nz+1);

	ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
	ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      // build the device matrix in the requested storage format and run the
      // cusparse solve-analysis phase once; results are cached in A->spptr
      cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      // free temporary data
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
345 
#undef __FUNCT__
/* fixed typo: was "MatCUSPARSEAnalysiAndCopyToGPU", which produced a wrong
   function name in PETSc error tracebacks */
#define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"
/*
   Builds both triangular factors on the GPU, allocates the temporary work
   vector shared by the two triangular solves, and uploads the row/column
   permutation indices (when the orderings are not identities) for the
   reordering solve path.
*/
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS               isrow = a->row,iscol = a->icol;
  PetscBool        row_identity,col_identity;
  const PetscInt   *r,*c;
  PetscInt          n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  // avoid leaking a previously allocated work vector when the factors are
  // rebuilt (e.g. after a storage-format change or refactorization)
  if (cusparseTriFactors->tempvec) delete (CUSPARRAY*) cusparseTriFactors->tempvec;
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  //lower triangular indices
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity)
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  //upper triangular indices
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity)
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
380 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization: runs the CPU SeqAIJ numeric phase, chooses the
   MatSolve variant based on whether the row/column orderings are
   identities, restores the GPU multiplication kernels on A, and copies the
   resulting triangular factors of B to the GPU.
*/
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode   ierr;
  Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
  IS               isrow = b->row,iscol = b->col;
  PetscBool        row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  // determine which version of MatSolve needs to be used.
  // identity orderings allow the cheaper no-permutation solve path
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  // Not sure why this is necessary but somehow the function pointers got reset
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  //if (!row_identity) printf("Row permutations exist!");
  //if (!col_identity) printf("Col permutations exist!");

  // get the triangular factors
  ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
411 
412 
413 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   GPU triangular solve xx = U^{-1} L^{-1} bb for factors that carry a
   row/column reordering: the right-hand side is permuted in, the two
   triangular solves run through the shared temporary vector, and the
   result is permuted back out (indices were uploaded via setOrdIndices
   in MatCUSPARSEAnalysisAndCopyToGPU).
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  CUSPARRAY      *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve with reordering
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  // block until the device work completes before logging/returning
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
444 
445 
446 
447 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   GPU triangular solve xx = U^{-1} L^{-1} bb for the natural-ordering
   case (identity row and column permutations): no reorderIn/reorderOut
   step is needed, so the solves read bb and write xx directly through the
   shared temporary vector.
*/
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode    ierr;
  CUSPARRAY         *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  // Get the GPU pointers
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  // solve
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  // block until the device work completes before logging/returning
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
476 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSECopyToGPU"
/*
   Mirrors the (non-factored) host AIJ matrix to the GPU when the device
   copy is missing or stale.  Any existing device matrix is deleted and
   rebuilt from scratch.  Rows with no nonzeros are squeezed out: either
   the host compressed-row arrays are reused directly, or a compressed
   row-offset/row-index pair is constructed here and freed after upload.
*/
PetscErrorCode MatCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparseMat  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
  PetscInt        m           = A->rmap->n,*ii,*ridx;
  PetscErrorCode  ierr;


  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and insured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
	delete cusparseMat->mat;
	if (cusparseMat->tempvec)
	  delete cusparseMat->tempvec;

      } catch(char* ex) {
	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      // count rows that contain at least one nonzero
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
	cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
	m    = a->compressedrow.nrows;
	ii   = a->compressedrow.i;
	ridx = a->compressedrow.rindex;
      } else {
	// Forcing compressed row on the GPU ... only relevant for CSR storage
	// build ii (row offsets) and ridx (original row numbers) over the
	// nonempty rows only; freed below after the upload
	int k=0;
	ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
	ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
	ii[0]=0;
	for (int j = 0; j<m; j++) {
	  if ((a->i[j+1]-a->i[j])>0) {
	    ii[k] = a->i[j];
	    ridx[k]= j;
	    k++;
	  }
	}
	ii[cusparseMat->nonzerorow] = a->nz;
	m = cusparseMat->nonzerorow;
      }

      // Build our matrix ... first determine the GPU storage type
      cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format);

      // Create the streams and events (if desired).
      // NOTE(review): uses PETSC_COMM_WORLD rather than the matrix's
      // communicator — presumably deliberate for the shared-stream parallel
      // SpMV hack (see theBodyStream above), but verify
      PetscMPIInt    size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      // FILL MODE UPPER is irrelevant
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      // lastly, build the matrix
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
	//      }
      if (!a->compressedrow.use) {
	// free data
	ierr = PetscFree(ii);CHKERRQ(ierr);
	ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
      //A->spptr = cusparseMat;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
564 
565 // #undef __FUNCT__
566 // #define __FUNCT__ "MatCUSPARSECopyFromGPU"
567 // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu)
568 // {
569 //   Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr;
570 //   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
571 //   PetscInt        m          = A->rmap->n;
572 //   PetscErrorCode  ierr;
573 
574 //   PetscFunctionBegin;
575 //   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
576 //     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
577 //       try {
578 //         cuspstruct->mat = Agpu;
579 //         if (a->compressedrow.use) {
580 //           //PetscInt *ii, *ridx;
581 //           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
582 //         } else {
583 //           PetscInt i;
584 
585 //           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
586 //           a->nz    = cuspstruct->mat->values.size();
587 //           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
588 //           A->preallocated = PETSC_TRUE;
589 //           // Copy ai, aj, aa
590 //           if (a->singlemalloc) {
591 //             if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
592 //           } else {
593 //             if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
594 //             if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
595 //             if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
596 //           }
597 //           ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr);
598 //           ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
599 //           a->singlemalloc = PETSC_TRUE;
600 //           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
601 //           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
602 //           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
603 //           // Setup row lengths
604 //           if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
605 //           ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr);
606 //           ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
607 //           for(i = 0; i < m; ++i) {
608 //             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
609 //           }
610 //           // a->diag?
611 //         }
612 //         cuspstruct->tempvec = new CUSPARRAY;
613 //         cuspstruct->tempvec->resize(m);
614 //       } catch(char *ex) {
615 //         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
616 //       }
617 //     }
618 //     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
619 //     ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
620 //     ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
621 //     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
622 //   } else {
623 //     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
624 //   }
625 //   PetscFunctionReturn(0);
626 // }
627 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates CUSP vectors (VECSEQCUSP) conforming to the matrix layouts:
   the right vector (input of MatMult) conforms to the column layout,
   the left vector (output of MatMult) conforms to the row layout.
   Either pointer may be PETSC_NULL to skip that vector.
*/
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;

  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* right vector conforms to the COLUMN layout; previously used rmap->bs */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
652 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Sparse matrix-vector product yy = A*xx performed on the GPU.
   Assumes the device copy of A is current (the upload happens in
   MatAssemblyEnd_SeqAIJCUSPARSE).
*/
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a           = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xGPU,*yGPU;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  // The copy to the GPU is not needed here; it is done in MatAssemblyEnd_SeqAIJCUSPARSE
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xGPU, yGPU);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  // synchronize unless the matrix reports a non-default stream in use
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
680 
681 
682 #undef __FUNCT__
683 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
684 PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
685 {
686   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
687   PetscErrorCode ierr;
688   Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
689   CUSPARRAY      *xarray,*yarray;
690 
691   PetscFunctionBegin;
692   // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
693   // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
694   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
695   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
696   try {
697 #if !defined(PETSC_USE_COMPLEX)
698     ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
699 #else
700     ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
701 #endif
702   } catch (char* ex) {
703     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
704   }
705   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
706   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
707   if (!cusparseMat->mat->hasNonZeroStream()) {
708     ierr = WaitForGPU();CHKERRCUSP(ierr);
709   }
710   ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
711   PetscFunctionReturn(0);
712 }
713 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A*xx on the GPU: zz is first seeded with a copy of
   yy, then the product A*xx is accumulated into it via multiplyAdd.
*/
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    // NOTE(review): yarray is acquired/restored but never passed to
    // multiplyAdd (zz already holds the copy of yy) — possibly only kept
    // for the up-to-date-on-GPU side effect of the Get call; verify
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
745 
#undef __FUNCT__
/* fixed: previously said "MatMultAdd_SeqAIJCUSPARSE", giving wrong
   function names in error tracebacks for this routine */
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A^T*xx (A^H*xx for complex scalars) on the GPU:
   zz is seeded with a copy of yy, then the transpose product is
   accumulated into it.
*/
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY      *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add with matrix transpose; accumulate into zarray (the
    // output vector zz) to match MatMultAdd_SeqAIJCUSPARSE — previously
    // this accumulated into yarray, clobbering the input yy and leaving
    // zz as a plain copy of yy
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
781 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Finish the host-side AIJ assembly, then mirror the matrix onto the GPU.
     MatCUSPARSECopyToGPU() makes the explicit valid_GPU_matrix bookkeeping
     that used to live here unnecessary. */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* Only a final assembly (re)installs the GPU mat-vec kernels. */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
801 
802 /* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@C
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameter nz
   (or the array nnz).  By setting these parameters accurately, performance
   during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()

@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* Use the public preallocation interface, which dispatches through the
     function composed on the matrix, instead of calling the SeqAIJ
     implementation directly; it also takes nnz as const, so the previous
     (PetscInt*) cast that discarded const is no longer needed. */
  ierr = MatSeqAIJSetPreallocation(*A,nz,nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
863 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - releases the GPU-side data hung off A->spptr
   (either the regular GPU matrix or the triangular factors, depending on
   A->factortype), then delegates to MatDestroy_SeqAIJ for the host data.
*/
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode        ierr;
  Mat_SeqAIJCUSPARSE      *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    // The regular matrices
    try {
      // Only delete the GPU matrix if one was ever built (valid_GPU_matrix
      // stays PETSC_CUSP_UNALLOCATED until the first copy to the device).
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
	delete (GPU_Matrix_Ifc *)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
	delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    // The triangular factors
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc *cusparseMatLo  = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc *cusparseMatUp  = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      // destroy the matrix and the container
      delete (GPU_Matrix_Ifc *)cusparseMatLo;
      delete (GPU_Matrix_Ifc *)cusparseMatUp;
      delete (CUSPARRAY*) cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  // NOTE(review): MAT_cusparseHandle is shared by every aijcusparse matrix
  // (created lazily in MatCreate_SeqAIJCUSPARSE) yet it is destroyed here as
  // soon as ANY one matrix is destroyed. Other still-live matrices would be
  // left without a handle until a new matrix recreates it -- confirm whether
  // this needs reference counting.
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
911 
912 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - type constructor: builds a SeqAIJ matrix, then
   hangs the GPU-side container off B->spptr and installs the cusparse
   operation table.
*/
PetscErrorCode  MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  GPUStorageFormat format = CSR;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* A regular (non-factored) matrix gets an empty GPU matrix container.
       The inode.use flag cannot be consulted here: the matrix was just created. */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    cusp->mat     = 0;
    cusp->tempvec = 0;
    cusp->format  = format;
    B->spptr      = cusp;
  } else {
    /* A factored matrix gets empty lower/upper triangular-factor containers. */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    tri->loTriFactorPtr = 0;
    tri->upTriFactorPtr = 0;
    tri->tempvec        = 0;
    tri->format         = format;
    B->spptr            = tri;
  }
  /* A single MAT_cusparseHandle is shared by all matrices (matMult, TriSolve, ...);
     create it lazily on first use. */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Overload MatGetFactor_petsc_C so that -mat_type aijcusparse picks up the
     default cusparse triangular solve; compare with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu. */
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption        = MatSetOption_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
  PetscFunctionReturn(0);
}
EXTERN_C_END
961 
962