xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 34d6c7a575bdb8fc9e2ffe999767d738671d28b3)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
20fd2b57fSKarl Rupp 
33d13b8fdSMatthew G. Knepley #include <petscconf.h>
49ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
53d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
69ae82921SPaul Mullowney 
7b06137fdSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   Preallocation for MATMPIAIJCUSPARSE.

   Validates the per-row nonzero counts, creates the two sequential
   MATSEQAIJCUSPARSE blocks (diagonal b->A and off-diagonal b->B) on first
   use, preallocates both, and pushes the GPU storage formats, the cusparse
   handle and the CUDA stream held in the Mat_MPIAIJCUSPARSE struct down to
   the blocks.

   Input Parameters:
+  B     - the MATMPIAIJCUSPARSE matrix
.  d_nz  - nonzeros per row of the diagonal block (same for all rows)
.  d_nnz - per-row nonzero counts for the diagonal block, or NULL
.  o_nz  - nonzeros per row of the off-diagonal block (same for all rows)
-  o_nnz - per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  /* layouts must be final before the local block sizes below can be read */
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* reject negative per-row counts up front, with the offending row in the message */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    /* off-diagonal block initially spans all global columns; it is compressed later */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate GPU settings chosen at the MPI level down to the sequential blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
53e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE"
/*
   Creates VECCUDA vectors conforming to the matrix layouts: *right is
   compatible with the column map (for y = A*x inputs), *left with the row
   map (for outputs).  Either pointer may be NULL to skip that vector.
*/
PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    /* sizes/block size first, then type, then adopt the matrix column layout */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
819ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/* yy = A*xx on the GPU.  The diagonal-block SpMV kernel is launched first;
   the ghost-value scatter (DeviceToHost copy, MPI exchange, HostToDevice
   copy) runs in a separate stream, and once that stream is synchronized the
   off-diagonal MatMultAdd kernel is launched in the original stream to
   protect against race conditions.  Intended for GPU computation only. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  /* xx lives in the column space of A */
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  if (nloc != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* diagonal block: yy = Ad*xx */
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  /* gather ghost entries of xx into lvec */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* off-diagonal block: yy += Ao*lvec */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
112ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/*
   yy = A^T * xx on the GPU.

   Unlike the forward multiply, the transpose requires a REVERSE scatter with
   ADD_VALUES (mirroring MatMultTranspose_MPIAIJ): the off-diagonal block's
   transpose produces contributions to rows owned by other processes, which
   are accumulated into yy after the local diagonal-block transpose.

   This sequence should only be called for GPU computation.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* for the transpose, xx lives in the ROW space of A, so check rmap (not cmap) */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal contributions destined for other processes: lvec = Ao^T * xx */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* local contribution: yy = Ad^T * xx */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* reverse scatter with ADD_VALUES accumulates the off-process pieces into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1439ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/*
   Records the requested cuSPARSE storage format for the diagonal block, the
   off-diagonal block, or both.  The formats are applied to the sequential
   blocks during preallocation.

   Input Parameters:
+  A      - the MATMPIAIJCUSPARSE matrix
.  op     - which block(s) the format applies to (MAT_CUSPARSE_MULT_DIAG,
            MAT_CUSPARSE_MULT_OFFDIAG, or MAT_CUSPARSE_ALL)
-  format - the cuSPARSE storage format to use
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* message previously listed MAT_CUSPARSE_MULT_DIAG twice and a nonexistent
       MAT_CUSPARSE_MULT_ALL; it now names the cases actually handled above */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
168e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   Processes the -mat_cusparse_* storage-format options for an unfactored
   MATMPIAIJCUSPARSE matrix.

   This routine is called from within an options scope already opened by the
   generic MatSetFromOptions() (which supplies PetscOptionsObject), so it
   must use PetscOptionsHead/PetscOptionsTail.  The previous code also opened
   a second, nested scope with PetscObjectOptionsBegin (without checking its
   ierr) and closed it with PetscOptionsEnd while never calling the Tail that
   matches the Head; that mismatched pair has been removed.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202e057df02SPaul Mullowney 
203*34d6c7a5SJose E. Roman PETSC_EXTERN PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat,MatAssemblyType);
204*34d6c7a5SJose E. Roman 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_MPIAIJCUSPARSE"
/* Runs the standard MPIAIJ assembly, then — on the first final assembly —
   converts the ghost-value work vector lvec to VECSEQCUDA so scatters feed
   the GPU kernels. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
220*34d6c7a5SJose E. Roman 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   Destructor for MATMPIAIJCUSPARSE: detaches the shared cusparse handle from
   the two sequential blocks, destroys the cusparse context and the CUDA
   stream, deletes the Mat_MPIAIJCUSPARSE struct, then delegates to the
   generic MPIAIJ destructor.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* clear the blocks' references before destroying the shared handle/stream */
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    /* NOTE(review): stat is a cusparseStatus_t but is checked with CHKERRCUDA —
       confirm this macro accepts cusparse status codes in this PETSc version */
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    /* only char* throws are caught here; other exception types would propagate */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* NOTE(review): this nulls only the local pointer; a->spptr still holds the
     freed address until MatDestroy_MPIAIJ runs — verify nothing reads it */
  cusparseStruct = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
246ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   Constructor for MATMPIAIJCUSPARSE: builds the base MPIAIJ matrix, attaches
   a Mat_MPIAIJCUSPARSE struct (CSR formats, fresh cusparse handle and CUDA
   stream), and installs the CUSPARSE-specific operations and composed
   functions.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *spptr;
  cusparseStatus_t   stat;
  cudaError_t        err;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  /* attach the CUSPARSE-specific state to the base struct */
  mpiaij        = (Mat_MPIAIJ*)A->data;
  spptr         = new Mat_MPIAIJCUSPARSE;
  mpiaij->spptr = spptr;

  spptr->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  spptr->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&spptr->handle);CHKERRCUDA(stat);
  err = cudaStreamCreate(&spptr->stream);CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2809ae82921SPaul Mullowney 
2813f9c0db1SPaul Mullowney /*@
2823f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
2843f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
285e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
286e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
287e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
2889ae82921SPaul Mullowney 
289e057df02SPaul Mullowney    Collective on MPI_Comm
290e057df02SPaul Mullowney 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL
298e057df02SPaul Mullowney 
299e057df02SPaul Mullowney    Output Parameter:
300e057df02SPaul Mullowney .  A - the matrix
301e057df02SPaul Mullowney 
302e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
303e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
304e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
305e057df02SPaul Mullowney 
306e057df02SPaul Mullowney    Notes:
   If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored
308e057df02SPaul Mullowney 
309e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
310e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
311e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
312e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
313e057df02SPaul Mullowney 
314e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
3150298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
316e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
317e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
318e057df02SPaul Mullowney 
319e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
320e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
321e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
322e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
323e057df02SPaul Mullowney 
324e057df02SPaul Mullowney    Level: intermediate
325e057df02SPaul Mullowney 
326e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
327e057df02SPaul Mullowney @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc == 1) {
    /* uniprocessor communicator: sequential CUSPARSE matrix */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    /* parallel communicator: MPI CUSPARSE matrix with diag/offdiag preallocation */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3489ae82921SPaul Mullowney 
3493f9c0db1SPaul Mullowney /*M
350e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
351e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3532692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3542692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3559ae82921SPaul Mullowney 
3569ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3579ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3589ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3599ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3609ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3619ae82921SPaul Mullowney 
3629ae82921SPaul Mullowney    Options Database Keys:
363e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3648468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3658468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3668468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3679ae82921SPaul Mullowney 
3689ae82921SPaul Mullowney   Level: beginner
3699ae82921SPaul Mullowney 
3708468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
373