xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 9b2db2227e056d74e10acbb4c7c75cbd904ecd7e)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
20fd2b57fSKarl Rupp 
33d13b8fdSMatthew G. Knepley #include <petscconf.h>
49ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
53d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
69ae82921SPaul Mullowney 
79ae82921SPaul Mullowney PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
89ae82921SPaul Mullowney {
9bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
10bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
119ae82921SPaul Mullowney   PetscErrorCode     ierr;
129ae82921SPaul Mullowney   PetscInt           i;
139ae82921SPaul Mullowney 
149ae82921SPaul Mullowney   PetscFunctionBegin;
159ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
169ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
179ae82921SPaul Mullowney   if (d_nnz) {
189ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
199ae82921SPaul Mullowney       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
209ae82921SPaul Mullowney     }
219ae82921SPaul Mullowney   }
229ae82921SPaul Mullowney   if (o_nnz) {
239ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
249ae82921SPaul Mullowney       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
259ae82921SPaul Mullowney     }
269ae82921SPaul Mullowney   }
279ae82921SPaul Mullowney   if (!B->preallocated) {
28bbf3fe20SPaul Mullowney     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
299ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
309ae82921SPaul Mullowney     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
319ae82921SPaul Mullowney     ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
323bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
339ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
349ae82921SPaul Mullowney     ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
359ae82921SPaul Mullowney     ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
363bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
379ae82921SPaul Mullowney   }
389ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
399ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
40e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
41e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
42b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
43b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
44b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
45b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
462205254eSKarl Rupp 
479ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
489ae82921SPaul Mullowney   PetscFunctionReturn(0);
499ae82921SPaul Mullowney }
50e057df02SPaul Mullowney 
512a7a6963SBarry Smith PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
529ae82921SPaul Mullowney {
539ae82921SPaul Mullowney   PetscErrorCode ierr;
5433d57670SJed Brown   PetscInt rbs,cbs;
559ae82921SPaul Mullowney 
569ae82921SPaul Mullowney   PetscFunctionBegin;
5733d57670SJed Brown   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
589ae82921SPaul Mullowney   if (right) {
59ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
609ae82921SPaul Mullowney     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
6133d57670SJed Brown     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
62c41cb2e2SAlejandro Lamas Daviña     ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
63cbf1f8acSSatish Balay     ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
649ae82921SPaul Mullowney   }
659ae82921SPaul Mullowney   if (left) {
66ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
679ae82921SPaul Mullowney     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
6833d57670SJed Brown     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
69c41cb2e2SAlejandro Lamas Daviña     ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
70cbf1f8acSSatish Balay     ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
71cbf1f8acSSatish Balay 
72cbf1f8acSSatish Balay 
739ae82921SPaul Mullowney   }
749ae82921SPaul Mullowney   PetscFunctionReturn(0);
759ae82921SPaul Mullowney }
769ae82921SPaul Mullowney 
779ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
789ae82921SPaul Mullowney {
79e057df02SPaul Mullowney   /* This multiplication sequence is different sequence
80e057df02SPaul Mullowney      than the CPU version. In particular, the diagonal block
81e057df02SPaul Mullowney      multiplication kernel is launched in one stream. Then,
82e057df02SPaul Mullowney      in a separate stream, the data transfers from DeviceToHost
83e057df02SPaul Mullowney      (with MPI messaging in between), then HostToDevice are
84e057df02SPaul Mullowney      launched. Once the data transfer stream is synchronized,
85e057df02SPaul Mullowney      to ensure messaging is complete, the MatMultAdd kernel
86e057df02SPaul Mullowney      is launched in the original (MatMult) stream to protect
87e057df02SPaul Mullowney      against race conditions.
88e057df02SPaul Mullowney 
89e057df02SPaul Mullowney      This sequence should only be called for GPU computation. */
909ae82921SPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
919ae82921SPaul Mullowney   PetscErrorCode ierr;
929ae82921SPaul Mullowney   PetscInt       nt;
939ae82921SPaul Mullowney 
949ae82921SPaul Mullowney   PetscFunctionBegin;
959ae82921SPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
969ae82921SPaul Mullowney   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
979ae82921SPaul Mullowney   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
989ae82921SPaul Mullowney   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
999ae82921SPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1009ae82921SPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1019ae82921SPaul Mullowney   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
1029ae82921SPaul Mullowney   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
1039ae82921SPaul Mullowney   PetscFunctionReturn(0);
1049ae82921SPaul Mullowney }
105ca45077fSPaul Mullowney 
106ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
107ca45077fSPaul Mullowney {
108e057df02SPaul Mullowney   /* This multiplication sequence is different sequence
109e057df02SPaul Mullowney      than the CPU version. In particular, the diagonal block
110e057df02SPaul Mullowney      multiplication kernel is launched in one stream. Then,
111e057df02SPaul Mullowney      in a separate stream, the data transfers from DeviceToHost
112e057df02SPaul Mullowney      (with MPI messaging in between), then HostToDevice are
113e057df02SPaul Mullowney      launched. Once the data transfer stream is synchronized,
114e057df02SPaul Mullowney      to ensure messaging is complete, the MatMultAdd kernel
115e057df02SPaul Mullowney      is launched in the original (MatMult) stream to protect
116e057df02SPaul Mullowney      against race conditions.
117e057df02SPaul Mullowney 
118e057df02SPaul Mullowney      This sequence should only be called for GPU computation. */
119ca45077fSPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
120ca45077fSPaul Mullowney   PetscErrorCode ierr;
121ca45077fSPaul Mullowney   PetscInt       nt;
122ca45077fSPaul Mullowney 
123ca45077fSPaul Mullowney   PetscFunctionBegin;
124ca45077fSPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
125ca45077fSPaul Mullowney   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
126*9b2db222SKarl Rupp   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr);
127*9b2db222SKarl Rupp   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
128ca45077fSPaul Mullowney   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
129*9b2db222SKarl Rupp   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
130*9b2db222SKarl Rupp   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
131ca45077fSPaul Mullowney   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
132ca45077fSPaul Mullowney   PetscFunctionReturn(0);
133ca45077fSPaul Mullowney }
1349ae82921SPaul Mullowney 
135e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
136ca45077fSPaul Mullowney {
137ca45077fSPaul Mullowney   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
138bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
139e057df02SPaul Mullowney 
140ca45077fSPaul Mullowney   PetscFunctionBegin;
141e057df02SPaul Mullowney   switch (op) {
142e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
143e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
144045c96e1SPaul Mullowney     break;
145e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
146e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
147045c96e1SPaul Mullowney     break;
148e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
149e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
150e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
151045c96e1SPaul Mullowney     break;
152e057df02SPaul Mullowney   default:
153e057df02SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
154045c96e1SPaul Mullowney   }
155ca45077fSPaul Mullowney   PetscFunctionReturn(0);
156ca45077fSPaul Mullowney }
157e057df02SPaul Mullowney 
1584416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
159e057df02SPaul Mullowney {
160e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
161e057df02SPaul Mullowney   PetscErrorCode           ierr;
162e057df02SPaul Mullowney   PetscBool                flg;
163a183c035SDominic Meiser   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
164a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
1655fd66863SKarl Rupp 
166e057df02SPaul Mullowney   PetscFunctionBegin;
167e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
168e057df02SPaul Mullowney   ierr = PetscObjectOptionsBegin((PetscObject)A);
169e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
170e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
171a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
172e057df02SPaul Mullowney     if (flg) {
173e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
174e057df02SPaul Mullowney     }
175e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
176a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
177e057df02SPaul Mullowney     if (flg) {
178e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
179e057df02SPaul Mullowney     }
180e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
181a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
182e057df02SPaul Mullowney     if (flg) {
183e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
184e057df02SPaul Mullowney     }
185e057df02SPaul Mullowney   }
186e057df02SPaul Mullowney   ierr = PetscOptionsEnd();CHKERRQ(ierr);
187e057df02SPaul Mullowney   PetscFunctionReturn(0);
188e057df02SPaul Mullowney }
189e057df02SPaul Mullowney 
19034d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
19134d6c7a5SJose E. Roman {
19234d6c7a5SJose E. Roman   PetscErrorCode ierr;
19334d6c7a5SJose E. Roman   Mat_MPIAIJ     *mpiaij;
19434d6c7a5SJose E. Roman 
19534d6c7a5SJose E. Roman   PetscFunctionBegin;
19634d6c7a5SJose E. Roman   mpiaij = (Mat_MPIAIJ*)A->data;
19734d6c7a5SJose E. Roman   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
19834d6c7a5SJose E. Roman   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
19934d6c7a5SJose E. Roman     ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
20034d6c7a5SJose E. Roman   }
20134d6c7a5SJose E. Roman   PetscFunctionReturn(0);
20234d6c7a5SJose E. Roman }
20334d6c7a5SJose E. Roman 
204bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
205bbf3fe20SPaul Mullowney {
206bbf3fe20SPaul Mullowney   PetscErrorCode     ierr;
207bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
208bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
209b06137fdSPaul Mullowney   cudaError_t        err;
210b06137fdSPaul Mullowney   cusparseStatus_t   stat;
211bbf3fe20SPaul Mullowney 
212bbf3fe20SPaul Mullowney   PetscFunctionBegin;
213bbf3fe20SPaul Mullowney   try {
214b06137fdSPaul Mullowney     ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
215b06137fdSPaul Mullowney     ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
216c41cb2e2SAlejandro Lamas Daviña     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
217c41cb2e2SAlejandro Lamas Daviña     err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
218bbf3fe20SPaul Mullowney     delete cusparseStruct;
219bbf3fe20SPaul Mullowney   } catch(char *ex) {
220bbf3fe20SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
221bbf3fe20SPaul Mullowney   }
222bbf3fe20SPaul Mullowney   cusparseStruct = 0;
2232205254eSKarl Rupp 
224bbf3fe20SPaul Mullowney   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
225bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
226bbf3fe20SPaul Mullowney }
227ca45077fSPaul Mullowney 
2288cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
2299ae82921SPaul Mullowney {
2309ae82921SPaul Mullowney   PetscErrorCode     ierr;
231bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
232bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct;
233b06137fdSPaul Mullowney   cudaError_t        err;
234b06137fdSPaul Mullowney   cusparseStatus_t   stat;
2359ae82921SPaul Mullowney 
2369ae82921SPaul Mullowney   PetscFunctionBegin;
237bbf3fe20SPaul Mullowney   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
238bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
239bbf3fe20SPaul Mullowney   a        = (Mat_MPIAIJ*)A->data;
240bbf3fe20SPaul Mullowney   a->spptr = new Mat_MPIAIJCUSPARSE;
2412205254eSKarl Rupp 
242bbf3fe20SPaul Mullowney   cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
243e057df02SPaul Mullowney   cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
244e057df02SPaul Mullowney   cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
245c41cb2e2SAlejandro Lamas Daviña   stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
246c41cb2e2SAlejandro Lamas Daviña   err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);
2472205254eSKarl Rupp 
24834d6c7a5SJose E. Roman   A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
2492a7a6963SBarry Smith   A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
250bbf3fe20SPaul Mullowney   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
251bbf3fe20SPaul Mullowney   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
252bbf3fe20SPaul Mullowney   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
253bbf3fe20SPaul Mullowney   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
2542205254eSKarl Rupp 
255bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
256bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
2579ae82921SPaul Mullowney   PetscFunctionReturn(0);
2589ae82921SPaul Mullowney }
2599ae82921SPaul Mullowney 
2603f9c0db1SPaul Mullowney /*@
2613f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
262e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
2633f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
264e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
265e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
266e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
2679ae82921SPaul Mullowney 
268e057df02SPaul Mullowney    Collective on MPI_Comm
269e057df02SPaul Mullowney 
270e057df02SPaul Mullowney    Input Parameters:
271e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
272e057df02SPaul Mullowney .  m - number of rows
273e057df02SPaul Mullowney .  n - number of columns
274e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
275e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
2760298fd71SBarry Smith          (possibly different for each row) or NULL
277e057df02SPaul Mullowney 
278e057df02SPaul Mullowney    Output Parameter:
279e057df02SPaul Mullowney .  A - the matrix
280e057df02SPaul Mullowney 
281e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
282e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
283e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
284e057df02SPaul Mullowney 
285e057df02SPaul Mullowney    Notes:
286e057df02SPaul Mullowney    If nnz is given then nz is ignored
287e057df02SPaul Mullowney 
288e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
289e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
290e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
291e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
292e057df02SPaul Mullowney 
293e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
2940298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
295e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
296e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
297e057df02SPaul Mullowney 
298e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
299e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
300e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
301e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
302e057df02SPaul Mullowney 
303e057df02SPaul Mullowney    Level: intermediate
304e057df02SPaul Mullowney 
305e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
306e057df02SPaul Mullowney @*/
3079ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
3089ae82921SPaul Mullowney {
3099ae82921SPaul Mullowney   PetscErrorCode ierr;
3109ae82921SPaul Mullowney   PetscMPIInt    size;
3119ae82921SPaul Mullowney 
3129ae82921SPaul Mullowney   PetscFunctionBegin;
3139ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
3149ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
3159ae82921SPaul Mullowney   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
3169ae82921SPaul Mullowney   if (size > 1) {
3179ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
3189ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
3199ae82921SPaul Mullowney   } else {
3209ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3219ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
3229ae82921SPaul Mullowney   }
3239ae82921SPaul Mullowney   PetscFunctionReturn(0);
3249ae82921SPaul Mullowney }
3259ae82921SPaul Mullowney 
3263ca39a21SBarry Smith /*MC
327e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
328e057df02SPaul Mullowney 
3292692e278SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
3302692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3312692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3329ae82921SPaul Mullowney 
3339ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3349ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3359ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3369ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3379ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3389ae82921SPaul Mullowney 
3399ae82921SPaul Mullowney    Options Database Keys:
340e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3418468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3428468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3438468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3449ae82921SPaul Mullowney 
3459ae82921SPaul Mullowney   Level: beginner
3469ae82921SPaul Mullowney 
3478468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3488468deeeSKarl Rupp M
3499ae82921SPaul Mullowney M*/
350