xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision dced61a5cfeeeda68dfa4ee3e53b34d3cfb8da9f)
10fd2b57fSKarl Rupp #define PETSC_SKIP_COMPLEX
2*dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
30fd2b57fSKarl Rupp 
43d13b8fdSMatthew G. Knepley #include <petscconf.h>
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
63d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
79ae82921SPaul Mullowney 
8b06137fdSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   Preallocates the diagonal (A) and off-diagonal (B) blocks of an
   MPIAIJCUSPARSE matrix as MATSEQAIJCUSPARSE matrices, then pushes the
   MPI-level cusparse storage formats, library handle, and CUDA stream
   down onto both sequential blocks.
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Substitute the default per-row estimates when the caller lets PETSc decide. */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Validate the optional per-row estimates before touching any matrix state. */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create the two sequential MATSEQAIJCUSPARSE blocks. */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Both blocks share one cusparse handle and stream; the storage format may
     differ between the diagonal and off-diagonal parts. */
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
59e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE"
/*
   Creates GPU (VECCUSP) vectors compatible with this matrix: *right matches
   the column layout (an input for y = A*x) and *left matches the row layout
   (an output for y = A*x).  Either output pointer may be NULL.
*/
PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  /* The two requests are independent; each vector inherits the matching
     PetscLayout from the matrix so the parallel distribution agrees. */
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
879ae82921SPaul Mullowney 
889ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/*
   GPU matrix-vector product yy = A*xx for an MPIAIJ matrix.

   Unlike the CPU path, the diagonal-block SpMV kernel is launched in one
   stream while, in a separate stream, the DeviceToHost transfer, the MPI
   exchange of ghost values, and the HostToDevice transfer proceed.  After
   the transfer stream is synchronized (so messaging is complete), the
   off-diagonal MatMultAdd kernel runs in the original stream to avoid a
   race with the diagonal product.  Intended for GPU computation only.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);                      /* diagonal block */
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);      /* off-diagonal block */
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
119ca45077fSPaul Mullowney 
120ca45077fSPaul Mullowney #undef __FUNCT__
121ca45077fSPaul Mullowney #define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
122ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
123ca45077fSPaul Mullowney {
124e057df02SPaul Mullowney   /* This multiplication sequence is different sequence
125e057df02SPaul Mullowney      than the CPU version. In particular, the diagonal block
126e057df02SPaul Mullowney      multiplication kernel is launched in one stream. Then,
127e057df02SPaul Mullowney      in a separate stream, the data transfers from DeviceToHost
128e057df02SPaul Mullowney      (with MPI messaging in between), then HostToDevice are
129e057df02SPaul Mullowney      launched. Once the data transfer stream is synchronized,
130e057df02SPaul Mullowney      to ensure messaging is complete, the MatMultAdd kernel
131e057df02SPaul Mullowney      is launched in the original (MatMult) stream to protect
132e057df02SPaul Mullowney      against race conditions.
133e057df02SPaul Mullowney 
134e057df02SPaul Mullowney      This sequence should only be called for GPU computation. */
135ca45077fSPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
136ca45077fSPaul Mullowney   PetscErrorCode ierr;
137ca45077fSPaul Mullowney   PetscInt       nt;
138ca45077fSPaul Mullowney 
139ca45077fSPaul Mullowney   PetscFunctionBegin;
140ca45077fSPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
141ca45077fSPaul Mullowney   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
142ca45077fSPaul Mullowney   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
143ca45077fSPaul Mullowney   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
144ca45077fSPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
145ca45077fSPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
146ca45077fSPaul Mullowney   ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
147ca45077fSPaul Mullowney   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
148ca45077fSPaul Mullowney   PetscFunctionReturn(0);
149ca45077fSPaul Mullowney }
1509ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/*
   Records the requested cusparse storage format for the diagonal and/or
   off-diagonal block of an MPIAIJCUSPARSE matrix.  The format is applied to
   the sequential blocks during preallocation.

   Fix: the unsupported-operation error message listed MAT_CUSPARSE_MULT_DIAG
   twice and referred to the nonexistent MAT_CUSPARSE_MULT_ALL instead of the
   actually-handled MAT_CUSPARSE_MULT_OFFDIAG and MAT_CUSPARSE_ALL.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
175e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   Processes the -mat_cusparse_* storage-format options for an MPIAIJCUSPARSE
   matrix (no-op for factored matrices).

   Fix: the previous version called both PetscOptionsHead(PetscOptionsObject,...)
   (the options-object interface) AND PetscObjectOptionsBegin(), which opens a
   second, nested options loop and shadows the PetscOptionsObject parameter,
   with no matching PetscOptionsTail().  Use the options-object interface
   consistently: Head ... Tail, no Begin/End pair.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* Sets both diag and offdiag at once; defaults shown from the diag setting. */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
209e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   Destroys an MPIAIJCUSPARSE matrix: clears the cusparse handles of the two
   sequential blocks, destroys the shared cusparse handle and CUDA stream,
   frees the MPI-level cusparse struct, then defers to MatDestroy_MPIAIJ.

   Fix: the previous version nulled only the local pointer copy
   (cusparseStruct = 0) after the delete, leaving a->spptr dangling while
   MatDestroy_MPIAIJ still runs on A; now the struct member itself is nulled.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
    err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;  /* the struct is gone; do not leave a dangling pointer in A->data */

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
235ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   Constructor for MATMPIAIJCUSPARSE: builds on MatCreate_MPIAIJ, allocates
   the MPI-level cusparse struct (CSR format defaults, one cusparse handle and
   one CUDA stream shared by both sequential blocks), and installs the
   GPU-specific operations and composed functions.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *cusp;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  /* Allocate and initialize the MPI-level cusparse bookkeeping struct. */
  mpiaij        = (Mat_MPIAIJ*)A->data;
  cusp          = new Mat_MPIAIJCUSPARSE;
  mpiaij->spptr = cusp;

  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusp->handle));CHKERRCUSP(stat);
  err = cudaStreamCreate(&(cusp->stream));CHKERRCUSP(err);

  /* Override the operations that have GPU-aware implementations. */
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2689ae82921SPaul Mullowney 
2693f9c0db1SPaul Mullowney /*@
2703f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
2723f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
273e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
274e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
275e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
2769ae82921SPaul Mullowney 
277e057df02SPaul Mullowney    Collective on MPI_Comm
278e057df02SPaul Mullowney 
279e057df02SPaul Mullowney    Input Parameters:
280e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
281e057df02SPaul Mullowney .  m - number of rows
282e057df02SPaul Mullowney .  n - number of columns
283e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
284e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
2850298fd71SBarry Smith          (possibly different for each row) or NULL
286e057df02SPaul Mullowney 
287e057df02SPaul Mullowney    Output Parameter:
288e057df02SPaul Mullowney .  A - the matrix
289e057df02SPaul Mullowney 
290e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
291e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
292e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
293e057df02SPaul Mullowney 
294e057df02SPaul Mullowney    Notes:
295e057df02SPaul Mullowney    If nnz is given then nz is ignored
296e057df02SPaul Mullowney 
297e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
298e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
299e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
300e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
301e057df02SPaul Mullowney 
302e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
3030298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
304e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
305e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
306e057df02SPaul Mullowney 
307e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
308e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
309e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
310e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
311e057df02SPaul Mullowney 
312e057df02SPaul Mullowney    Level: intermediate
313e057df02SPaul Mullowney 
314e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
315e057df02SPaul Mullowney @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nprocs;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&nprocs);CHKERRQ(ierr);
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  /* Pick the sequential or parallel variant based on the communicator size. */
  if (nprocs > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3369ae82921SPaul Mullowney 
3373f9c0db1SPaul Mullowney /*M
338e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
339e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3412692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3422692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3439ae82921SPaul Mullowney 
3449ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3459ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3469ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3479ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3489ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3499ae82921SPaul Mullowney 
3509ae82921SPaul Mullowney    Options Database Keys:
351e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3528468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3538468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3548468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3559ae82921SPaul Mullowney 
3569ae82921SPaul Mullowney   Level: beginner
3579ae82921SPaul Mullowney 
3588468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
361