xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision fdc842d1a43d22858d8041cc45d2f5b4ad073a0b)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
20fd2b57fSKarl Rupp 
33d13b8fdSMatthew G. Knepley #include <petscconf.h>
49ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
53d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
69ae82921SPaul Mullowney 
/* Preallocation for the parallel AIJCUSPARSE matrix: validates the per-row nonzero
   counts, lazily creates the two on-process sequential blocks (diagonal b->A and
   off-diagonal b->B) as MATSEQAIJCUSPARSE, and pushes the GPU storage formats,
   the shared cuSPARSE handle, and the CUDA stream down to both blocks. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Negative preallocation counts are caller errors; report the offending local row */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    /* propagate the parent's pin-to-CPU setting before the type is set */
    ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    /* the off-diagonal block is local-rows by global-columns (compressed later via lvec) */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* both blocks share the parallel matrix's formats, cuSPARSE handle, and stream */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
52e057df02SPaul Mullowney 
/* Matrix-vector product yy = A*xx for the MPI AIJCUSPARSE type. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the CPU version.
     In particular, the diagonal block multiplication kernel is
     launched in one stream.  Then, in a separate stream, the data
     transfers from DeviceToHost (with MPI messaging in between),
     then HostToDevice are launched.  Once the data transfer stream
     is synchronized, to ensure messaging is complete, the MatMultAdd
     kernel is launched in the original (MatMult) stream to protect
     against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* diagonal-block product overlaps with the scatter of off-process entries into lvec */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* accumulate the off-diagonal contribution into yy */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
81ca45077fSPaul Mullowney 
/*
   Matrix-vector product with addition, zz = A*xx + yy, for the MPI AIJCUSPARSE type.

   The GPU sequence differs from the CPU code path: the diagonal-block kernel is
   launched in one stream while, in a separate stream, the off-process vector
   entries are moved DeviceToHost, exchanged via MPI, and copied back HostToDevice.
   Once the transfer stream has been synchronized (guaranteeing the messaging is
   complete), the off-diagonal MatMultAdd kernel is issued in the original stream,
   which protects against races between the two products.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nlocal;

  PetscFunctionBegin;
  /* sanity-check that xx conforms with the column layout of A */
  ierr = VecGetLocalSize(xx,&nlocal);CHKERRQ(ierr);
  if (nlocal != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nlocal);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx);CHKERRQ(ierr);
  /* zz = diag(A)*xx + yy, overlapped with the forward scatter into lvec */
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz += offdiag(A)*lvec */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
110*fdc842d1SBarry Smith 
/* Transpose matrix-vector product yy = A^T * xx for the MPI AIJCUSPARSE type. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This sequence also differs from the CPU version: both local transpose
     products (off-diagonal into lvec, diagonal into yy) are launched first;
     the off-process contributions held in lvec are then scattered in REVERSE
     with ADD_VALUES so they accumulate into yy on their owning ranks.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* for the transpose product xx must conform with the ROW layout of A */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* lvec = offdiag(A)^T * xx, computed before the reverse scatter that consumes it */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1399ae82921SPaul Mullowney 
/* Sets the cuSPARSE storage format used for the diagonal and/or off-diagonal
   blocks of an MPIAIJCUSPARSE matrix.

   op     - which block(s) the format applies to: MAT_CUSPARSE_MULT_DIAG,
            MAT_CUSPARSE_MULT_OFFDIAG, or MAT_CUSPARSE_ALL (both)
   format - the storage format to record

   The format is only recorded here; it is applied to the sequential blocks
   during preallocation. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed: the message previously listed MAT_CUSPARSE_MULT_DIAG twice and the
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the cases actually handled */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
162e057df02SPaul Mullowney 
/* Processes the -mat_cusparse_* options database keys for an MPIAIJCUSPARSE
   matrix, recording the requested storage formats for the diagonal and
   off-diagonal blocks.  Factored matrices are left untouched. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* per-block format options; the current format is shown as the default */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* catch-all option sets both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
193e057df02SPaul Mullowney 
/* Completes assembly of an MPIAIJCUSPARSE matrix.  Defers to the standard
   MPIAIJ assembly and then, on the first final assembly, switches the local
   scatter work vector to the CUDA sequential vector type so the off-diagonal
   product can consume it on the GPU. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    ierr = VecSetType(aij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
20734d6c7a5SJose E. Roman 
/* Destroys an MPIAIJCUSPARSE matrix: detaches the shared cuSPARSE handle from
   both sequential blocks, destroys the shared handle and CUDA stream, frees the
   cuSPARSE bookkeeping struct, and finally delegates to the MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* clear the blocks' references BEFORE destroying the shared handle they point to */
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    /* the CUSPARSE layers here throw C-string exceptions; convert to a PETSc error */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229ca45077fSPaul Mullowney 
/* Constructor for MATMPIAIJCUSPARSE: builds on the plain MPIAIJ matrix, then
   allocates the cuSPARSE bookkeeping struct (default CSR format for both
   blocks, one shared cuSPARSE handle and one CUDA stream) and installs the
   CUDA-aware method overrides. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE * cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  /* override preallocation so the blocks become MATSEQAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  /* vectors created from this matrix default to the CUDA type */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2649ae82921SPaul Mullowney 
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz).  By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated)
.  n - number of local columns (or PETSC_DECIDE to have calculated)
.  M - number of global rows (or PETSC_DETERMINE to have calculated)
.  N - number of global columns (or PETSC_DETERMINE to have calculated)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* pick the parallel or sequential CUSPARSE type based on the communicator size */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3309ae82921SPaul Mullowney 
3313ca39a21SBarry Smith /*MC
332e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
333e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3352692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3362692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3379ae82921SPaul Mullowney 
3389ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3399ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3409ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3419ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3429ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3439ae82921SPaul Mullowney 
3449ae82921SPaul Mullowney    Options Database Keys:
345e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3468468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3478468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3488468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3499ae82921SPaul Mullowney 
3509ae82921SPaul Mullowney   Level: beginner
3519ae82921SPaul Mullowney 
3528468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
355