xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision b470e4b452d98f011eeb1f84a34fed34535fc656)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
40fd2b57fSKarl Rupp 
53d13b8fdSMatthew G. Knepley #include <petscconf.h>
69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
99ae82921SPaul Mullowney 
109ae82921SPaul Mullowney PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
119ae82921SPaul Mullowney {
12bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
13bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
149ae82921SPaul Mullowney   PetscErrorCode     ierr;
159ae82921SPaul Mullowney   PetscInt           i;
169ae82921SPaul Mullowney 
179ae82921SPaul Mullowney   PetscFunctionBegin;
189ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
199ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
209ae82921SPaul Mullowney   if (d_nnz) {
219ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
229ae82921SPaul Mullowney       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
239ae82921SPaul Mullowney     }
249ae82921SPaul Mullowney   }
259ae82921SPaul Mullowney   if (o_nnz) {
269ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
279ae82921SPaul Mullowney       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
289ae82921SPaul Mullowney     }
299ae82921SPaul Mullowney   }
309ae82921SPaul Mullowney   if (!B->preallocated) {
31bbf3fe20SPaul Mullowney     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
329ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
33*b470e4b4SRichard Tran Mills     ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
349ae82921SPaul Mullowney     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
359ae82921SPaul Mullowney     ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
363bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
379ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
38*b470e4b4SRichard Tran Mills     ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
399ae82921SPaul Mullowney     ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
409ae82921SPaul Mullowney     ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
413bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
429ae82921SPaul Mullowney   }
439ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
449ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
45e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
46e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
47b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
48b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
49b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
50b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
512205254eSKarl Rupp 
529ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
539ae82921SPaul Mullowney   PetscFunctionReturn(0);
549ae82921SPaul Mullowney }
55e057df02SPaul Mullowney 
569ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
579ae82921SPaul Mullowney {
58fdc842d1SBarry Smith   /*
59fdc842d1SBarry Smith      This multiplication sequence is different sequence
60e057df02SPaul Mullowney      than the CPU version. In particular, the diagonal block
61e057df02SPaul Mullowney      multiplication kernel is launched in one stream. Then,
62e057df02SPaul Mullowney      in a separate stream, the data transfers from DeviceToHost
63e057df02SPaul Mullowney      (with MPI messaging in between), then HostToDevice are
64e057df02SPaul Mullowney      launched. Once the data transfer stream is synchronized,
65e057df02SPaul Mullowney      to ensure messaging is complete, the MatMultAdd kernel
66e057df02SPaul Mullowney      is launched in the original (MatMult) stream to protect
67e057df02SPaul Mullowney      against race conditions.
68fdc842d1SBarry Smith   */
699ae82921SPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
709ae82921SPaul Mullowney   PetscErrorCode ierr;
719ae82921SPaul Mullowney   PetscInt       nt;
729ae82921SPaul Mullowney 
739ae82921SPaul Mullowney   PetscFunctionBegin;
749ae82921SPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
759ae82921SPaul Mullowney   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
76959dcdf5SJunchao Zhang   ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
779ae82921SPaul Mullowney   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
789ae82921SPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
799ae82921SPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
809ae82921SPaul Mullowney   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
819ae82921SPaul Mullowney   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
829ae82921SPaul Mullowney   PetscFunctionReturn(0);
839ae82921SPaul Mullowney }
84ca45077fSPaul Mullowney 
85fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
86fdc842d1SBarry Smith {
87fdc842d1SBarry Smith   /*
88fdc842d1SBarry Smith      This multiplication sequence is different sequence
89fdc842d1SBarry Smith      than the CPU version. In particular, the diagonal block
90fdc842d1SBarry Smith      multiplication kernel is launched in one stream. Then,
91fdc842d1SBarry Smith      in a separate stream, the data transfers from DeviceToHost
92fdc842d1SBarry Smith      (with MPI messaging in between), then HostToDevice are
93fdc842d1SBarry Smith      launched. Once the data transfer stream is synchronized,
94fdc842d1SBarry Smith      to ensure messaging is complete, the MatMultAdd kernel
95fdc842d1SBarry Smith      is launched in the original (MatMult) stream to protect
96fdc842d1SBarry Smith      against race conditions.
97fdc842d1SBarry Smith   */
98fdc842d1SBarry Smith   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
99fdc842d1SBarry Smith   PetscErrorCode ierr;
100fdc842d1SBarry Smith   PetscInt       nt;
101fdc842d1SBarry Smith 
102fdc842d1SBarry Smith   PetscFunctionBegin;
103fdc842d1SBarry Smith   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
104fdc842d1SBarry Smith   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
105fdc842d1SBarry Smith   ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
106fdc842d1SBarry Smith   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
107fdc842d1SBarry Smith   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
108fdc842d1SBarry Smith   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
109fdc842d1SBarry Smith   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
110fdc842d1SBarry Smith   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
111fdc842d1SBarry Smith   PetscFunctionReturn(0);
112fdc842d1SBarry Smith }
113fdc842d1SBarry Smith 
114ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
115ca45077fSPaul Mullowney {
116e057df02SPaul Mullowney   /* This multiplication sequence is different sequence
117e057df02SPaul Mullowney      than the CPU version. In particular, the diagonal block
118e057df02SPaul Mullowney      multiplication kernel is launched in one stream. Then,
119e057df02SPaul Mullowney      in a separate stream, the data transfers from DeviceToHost
120e057df02SPaul Mullowney      (with MPI messaging in between), then HostToDevice are
121e057df02SPaul Mullowney      launched. Once the data transfer stream is synchronized,
122e057df02SPaul Mullowney      to ensure messaging is complete, the MatMultAdd kernel
123e057df02SPaul Mullowney      is launched in the original (MatMult) stream to protect
124e057df02SPaul Mullowney      against race conditions.
125e057df02SPaul Mullowney 
126e057df02SPaul Mullowney      This sequence should only be called for GPU computation. */
127ca45077fSPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
128ca45077fSPaul Mullowney   PetscErrorCode ierr;
129ca45077fSPaul Mullowney   PetscInt       nt;
130ca45077fSPaul Mullowney 
131ca45077fSPaul Mullowney   PetscFunctionBegin;
132ca45077fSPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
133ccf5f80bSJunchao Zhang   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
134a3fdcf43SKarl Rupp   ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
1359b2db222SKarl Rupp   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
136ca45077fSPaul Mullowney   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
1379b2db222SKarl Rupp   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1389b2db222SKarl Rupp   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
139ca45077fSPaul Mullowney   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
140ca45077fSPaul Mullowney   PetscFunctionReturn(0);
141ca45077fSPaul Mullowney }
1429ae82921SPaul Mullowney 
143e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
144ca45077fSPaul Mullowney {
145ca45077fSPaul Mullowney   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
146bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
147e057df02SPaul Mullowney 
148ca45077fSPaul Mullowney   PetscFunctionBegin;
149e057df02SPaul Mullowney   switch (op) {
150e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
151e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
152045c96e1SPaul Mullowney     break;
153e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
154e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
155045c96e1SPaul Mullowney     break;
156e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
157e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
158e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
159045c96e1SPaul Mullowney     break;
160e057df02SPaul Mullowney   default:
161e057df02SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
162045c96e1SPaul Mullowney   }
163ca45077fSPaul Mullowney   PetscFunctionReturn(0);
164ca45077fSPaul Mullowney }
165e057df02SPaul Mullowney 
1664416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
167e057df02SPaul Mullowney {
168e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
169e057df02SPaul Mullowney   PetscErrorCode           ierr;
170e057df02SPaul Mullowney   PetscBool                flg;
171a183c035SDominic Meiser   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
172a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
1735fd66863SKarl Rupp 
174e057df02SPaul Mullowney   PetscFunctionBegin;
175e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
176e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
177e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
178a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
179e057df02SPaul Mullowney     if (flg) {
180e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
181e057df02SPaul Mullowney     }
182e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
183a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
184e057df02SPaul Mullowney     if (flg) {
185e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
186e057df02SPaul Mullowney     }
187e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
188a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
189e057df02SPaul Mullowney     if (flg) {
190e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
191e057df02SPaul Mullowney     }
192e057df02SPaul Mullowney   }
1930af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
194e057df02SPaul Mullowney   PetscFunctionReturn(0);
195e057df02SPaul Mullowney }
196e057df02SPaul Mullowney 
19734d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
19834d6c7a5SJose E. Roman {
19934d6c7a5SJose E. Roman   PetscErrorCode ierr;
20034d6c7a5SJose E. Roman   Mat_MPIAIJ     *mpiaij;
20134d6c7a5SJose E. Roman 
20234d6c7a5SJose E. Roman   PetscFunctionBegin;
20334d6c7a5SJose E. Roman   mpiaij = (Mat_MPIAIJ*)A->data;
20434d6c7a5SJose E. Roman   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
20534d6c7a5SJose E. Roman   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
20634d6c7a5SJose E. Roman     ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
20734d6c7a5SJose E. Roman   }
20834d6c7a5SJose E. Roman   PetscFunctionReturn(0);
20934d6c7a5SJose E. Roman }
21034d6c7a5SJose E. Roman 
211bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
212bbf3fe20SPaul Mullowney {
213bbf3fe20SPaul Mullowney   PetscErrorCode     ierr;
214bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
215bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
216b06137fdSPaul Mullowney   cudaError_t        err;
217b06137fdSPaul Mullowney   cusparseStatus_t   stat;
218bbf3fe20SPaul Mullowney 
219bbf3fe20SPaul Mullowney   PetscFunctionBegin;
220bbf3fe20SPaul Mullowney   try {
221b06137fdSPaul Mullowney     ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
222b06137fdSPaul Mullowney     ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
22357d48284SJunchao Zhang     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
22417403302SKarl Rupp     if (cusparseStruct->stream) {
225c41cb2e2SAlejandro Lamas Daviña       err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
22617403302SKarl Rupp     }
227bbf3fe20SPaul Mullowney     delete cusparseStruct;
228bbf3fe20SPaul Mullowney   } catch(char *ex) {
229bbf3fe20SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
230bbf3fe20SPaul Mullowney   }
231bbf3fe20SPaul Mullowney   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
232bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
233bbf3fe20SPaul Mullowney }
234ca45077fSPaul Mullowney 
2358cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
2369ae82921SPaul Mullowney {
2379ae82921SPaul Mullowney   PetscErrorCode     ierr;
238bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
239bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct;
240b06137fdSPaul Mullowney   cusparseStatus_t   stat;
2419ae82921SPaul Mullowney 
2429ae82921SPaul Mullowney   PetscFunctionBegin;
243bbf3fe20SPaul Mullowney   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
244bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
24534136279SStefano Zampini   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
24634136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
24734136279SStefano Zampini 
248bbf3fe20SPaul Mullowney   a        = (Mat_MPIAIJ*)A->data;
249bbf3fe20SPaul Mullowney   a->spptr = new Mat_MPIAIJCUSPARSE;
2502205254eSKarl Rupp 
251bbf3fe20SPaul Mullowney   cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
252e057df02SPaul Mullowney   cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
253e057df02SPaul Mullowney   cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
25417403302SKarl Rupp   cusparseStruct->stream              = 0;
25557d48284SJunchao Zhang   stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
2562205254eSKarl Rupp 
25734d6c7a5SJose E. Roman   A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
258bbf3fe20SPaul Mullowney   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
259fdc842d1SBarry Smith   A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
260bbf3fe20SPaul Mullowney   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
261bbf3fe20SPaul Mullowney   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
262bbf3fe20SPaul Mullowney   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
2632205254eSKarl Rupp 
264bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
265bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
2669ae82921SPaul Mullowney   PetscFunctionReturn(0);
2679ae82921SPaul Mullowney }
2689ae82921SPaul Mullowney 
2693f9c0db1SPaul Mullowney /*@
2703f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
271e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
2723f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
273e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
274e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
275e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
2769ae82921SPaul Mullowney 
277d083f849SBarry Smith    Collective
278e057df02SPaul Mullowney 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
         (possibly different for each row) or NULL
286e057df02SPaul Mullowney 
287e057df02SPaul Mullowney    Output Parameter:
288e057df02SPaul Mullowney .  A - the matrix
289e057df02SPaul Mullowney 
290e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
291e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
292e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
293e057df02SPaul Mullowney 
294e057df02SPaul Mullowney    Notes:
   If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored
296e057df02SPaul Mullowney 
297e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
298e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
299e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
300e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
301e057df02SPaul Mullowney 
302e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
3030298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
304e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
305e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
306e057df02SPaul Mullowney 
307e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
308e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
309e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
310e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
311e057df02SPaul Mullowney 
312e057df02SPaul Mullowney    Level: intermediate
313e057df02SPaul Mullowney 
314e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
315e057df02SPaul Mullowney @*/
3169ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
3179ae82921SPaul Mullowney {
3189ae82921SPaul Mullowney   PetscErrorCode ierr;
3199ae82921SPaul Mullowney   PetscMPIInt    size;
3209ae82921SPaul Mullowney 
3219ae82921SPaul Mullowney   PetscFunctionBegin;
3229ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
3239ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
3249ae82921SPaul Mullowney   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
3259ae82921SPaul Mullowney   if (size > 1) {
3269ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
3279ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
3289ae82921SPaul Mullowney   } else {
3299ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3309ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
3319ae82921SPaul Mullowney   }
3329ae82921SPaul Mullowney   PetscFunctionReturn(0);
3339ae82921SPaul Mullowney }
3349ae82921SPaul Mullowney 
3353ca39a21SBarry Smith /*MC
336e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
337e057df02SPaul Mullowney 
3382692e278SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
3392692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3402692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3419ae82921SPaul Mullowney 
3429ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3439ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3449ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3459ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3469ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3479ae82921SPaul Mullowney 
3489ae82921SPaul Mullowney    Options Database Keys:
349e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3508468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3518468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3528468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3539ae82921SPaul Mullowney 
3549ae82921SPaul Mullowney   Level: beginner
3559ae82921SPaul Mullowney 
.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
359