mpi/mpicusparse/mpiaijcusparse.cu

dced61a5SBarry Smith#define PETSC_SKIP_SPINLOCK
0fd2b57fSKarl Rupp
3d13b8fdSMatthew G. Knepley#include <petscconf.h>
9ae82921SPaul Mullowney#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
3d13b8fdSMatthew G. Knepley#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
9ae82921SPaul Mullowney
/*
   MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - Preallocates the two sequential
   blocks held by an MPIAIJCUSPARSE matrix.

   Input Parameters:
+  B     - the parallel matrix
.  d_nz  - per-row nonzero count forwarded to the preallocation of the block stored in A
.  d_nnz - optional per-local-row counts for that block (each entry must be >= 0), or NULL
.  o_nz  - per-row nonzero count forwarded to the preallocation of the block stored in B
-  o_nnz - optional per-local-row counts for that block (each entry must be >= 0), or NULL

   Notes:
   On the first call (B->preallocated not yet set) the two MATSEQAIJCUSPARSE
   blocks are created: one of size (local rows) x (local columns) and one of
   size (local rows) x (global columns).  On every call both blocks are
   preallocated and receive the storage formats, cuSPARSE handle and stream
   recorded in the Mat_MPIAIJCUSPARSE structure.
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpimat = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *gpu    = (Mat_MPIAIJCUSPARSE*)mpimat->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* The layouts must be finalized before the local sizes are read below */
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);

  /* Reject negative per-row counts; all of d_nnz is checked before any of o_nnz */
  for (row=0; d_nnz && row<B->rmap->n; row++) {
    if (d_nnz[row] < 0) {
      SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  for (row=0; o_nnz && row<B->rmap->n; row++) {
    if (o_nnz[row] < 0) {
      SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }

  if (!B->preallocated) {
    /* First call: explicitly create the two MATSEQAIJCUSPARSE blocks */

    /* block A: local rows x local columns */
    ierr = MatCreate(PETSC_COMM_SELF,&mpimat->A);CHKERRQ(ierr);
    ierr = MatPinToCPU(mpimat->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(mpimat->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpimat->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpimat->A);CHKERRQ(ierr);

    /* block B: local rows x global columns */
    ierr = MatCreate(PETSC_COMM_SELF,&mpimat->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(mpimat->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(mpimat->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpimat->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpimat->B);CHKERRQ(ierr);
  }

  /* Preallocate both blocks */
  ierr = MatSeqAIJSetPreallocation(mpimat->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpimat->B,o_nz,o_nnz);CHKERRQ(ierr);

  /* Forward the per-block storage formats chosen on the parallel matrix */
  ierr = MatCUSPARSESetFormat(mpimat->A,MAT_CUSPARSE_MULT,gpu->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpimat->B,MAT_CUSPARSE_MULT,gpu->offdiagGPUMatFormat);CHKERRQ(ierr);

  /* Both blocks are handed the same cuSPARSE handle and the same stream */
  ierr = MatCUSPARSESetHandle(mpimat->A,gpu->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpimat->B,gpu->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpimat->A,gpu->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpimat->B,gpu->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
e057df02SPaul Mullowney
/*
   MatMult_MPIAIJCUSPARSE - Matrix-vector product for MPIAIJCUSPARSE matrices,
   installed as A->ops->mult.

   Input Parameters:
+  A  - the parallel matrix
-  xx - the input vector; its local size must equal the local column size of A

   Output Parameter:
.  yy - the result vector

   Notes:
   The sequence used here is:
     1. apply the mult of the sequential block a->A to xx, writing yy;
     2. scatter xx into the local work vector a->lvec (forward, INSERT_VALUES);
     3. apply the multadd of the sequential block a->B to a->lvec, accumulating into yy.
   The whole sequence is bracketed by VecScatterInitializeForGPU() and
   VecScatterFinalizeForGPU().

   Original author's note: this sequence differs from the CPU version.  The
   diagonal block kernel is launched in one stream; the DeviceToHost transfer,
   the MPI messaging and the HostToDevice transfer are launched in a separate
   stream.  Once the transfer stream has been synchronized, so that messaging
   is complete, the MatMultAdd kernel is launched in the original (MatMult)
   stream to protect against race conditions.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpimat = (Mat_MPIAIJ*)A->data;
  PetscInt       nlocal;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* The input vector must be partitioned like the columns of A */
  ierr = VecGetLocalSize(xx,&nlocal);CHKERRQ(ierr);
  if (nlocal != A->cmap->n) {
    SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nlocal);
  }

  ierr = VecScatterInitializeForGPU(mpimat->Mvctx,xx);CHKERRQ(ierr);
  /* product with the block stored in a->A */
  ierr = (*mpimat->A->ops->mult)(mpimat->A,xx,yy);CHKERRQ(ierr);
  /* fill the local work vector from xx */
  ierr = VecScatterBegin(mpimat->Mvctx,xx,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpimat->Mvctx,xx,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* add the contribution of the block stored in a->B */
  ierr = (*mpimat->B->ops->multadd)(mpimat->B,mpimat->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpimat->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
ca45077fSPaul Mullowney
/*
   MatMultAdd_MPIAIJCUSPARSE - Matrix-vector product plus vector for
   MPIAIJCUSPARSE matrices, installed as A->ops->multadd.

   Input Parameters:
+  A  - the parallel matrix
.  xx - the vector to be multiplied; its local size must equal the local column size of A
-  yy - the vector passed as the addend to the multadd of the block a->A

   Output Parameter:
.  zz - the result vector

   Notes:
   The sequence used here is:
     1. apply the multadd of the sequential block a->A to (xx,yy), writing zz;
     2. scatter xx into the local work vector a->lvec (forward, INSERT_VALUES);
     3. apply the multadd of the sequential block a->B to a->lvec, accumulating into zz.
   The whole sequence is bracketed by VecScatterInitializeForGPU() and
   VecScatterFinalizeForGPU().

   Original author's note: this sequence differs from the CPU version.  The
   diagonal block kernel is launched in one stream; the DeviceToHost transfer,
   the MPI messaging and the HostToDevice transfer are launched in a separate
   stream.  Once the transfer stream has been synchronized, so that messaging
   is complete, the MatMultAdd kernel is launched in the original (MatMult)
   stream to protect against race conditions.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpimat = (Mat_MPIAIJ*)A->data;
  PetscInt       nlocal;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* The input vector must be partitioned like the columns of A */
  ierr = VecGetLocalSize(xx,&nlocal);CHKERRQ(ierr);
  if (nlocal != A->cmap->n) {
    SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nlocal);
  }

  ierr = VecScatterInitializeForGPU(mpimat->Mvctx,xx);CHKERRQ(ierr);
  /* product with the block stored in a->A, with yy as the addend */
  ierr = (*mpimat->A->ops->multadd)(mpimat->A,xx,yy,zz);CHKERRQ(ierr);
  /* fill the local work vector from xx */
  ierr = VecScatterBegin(mpimat->Mvctx,xx,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpimat->Mvctx,xx,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* add the contribution of the block stored in a->B */
  ierr = (*mpimat->B->ops->multadd)(mpimat->B,mpimat->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpimat->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
*fdc842d1SBarry Smith
/*
   MatMultTranspose_MPIAIJCUSPARSE - Transpose matrix-vector product for
   MPIAIJCUSPARSE matrices, installed as A->ops->multtranspose.

   Input Parameters:
+  A  - the parallel matrix
-  xx - the input vector; its local size must equal the local row size of A

   Output Parameter:
.  yy - the result vector

   Notes:
   The sequence used here is:
     1. apply the multtranspose of the sequential block a->B to xx, writing the
        local work vector a->lvec;
     2. apply the multtranspose of the sequential block a->A to xx, writing yy;
     3. scatter a->lvec into yy in reverse mode with ADD_VALUES.
   The whole sequence is bracketed by VecScatterInitializeForGPU() and
   VecScatterFinalizeForGPU().

   Original author's note: this sequence differs from the CPU version and
   should only be called for GPU computation.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpimat = (Mat_MPIAIJ*)A->data;
  PetscInt       nlocal;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* For the transpose product the input must be partitioned like the rows of A */
  ierr = VecGetLocalSize(xx,&nlocal);CHKERRQ(ierr);
  if (nlocal != A->rmap->n) {
    SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nlocal);
  }

  ierr = VecScatterInitializeForGPU(mpimat->Mvctx,xx);CHKERRQ(ierr);
  /* transpose product with the block stored in a->B goes into the work vector */
  ierr = (*mpimat->B->ops->multtranspose)(mpimat->B,xx,mpimat->lvec);CHKERRQ(ierr);
  /* transpose product with the block stored in a->A goes directly into yy */
  ierr = (*mpimat->A->ops->multtranspose)(mpimat->A,xx,yy);CHKERRQ(ierr);
  /* accumulate the work vector into yy */
  ierr = VecScatterBegin(mpimat->Mvctx,mpimat->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpimat->Mvctx,mpimat->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpimat->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9ae82921SPaul Mullowney
/*
   MatCUSPARSESetFormat_MPIAIJCUSPARSE - Records the GPU storage format to be
   used for the blocks of an MPIAIJCUSPARSE matrix.

   Input Parameters:
+  A      - the parallel matrix
.  op     - which block(s) the format applies to: MAT_CUSPARSE_MULT_DIAG,
            MAT_CUSPARSE_MULT_OFFDIAG, or MAT_CUSPARSE_ALL (both)
-  format - the storage format to record

   Notes:
   Only the fields of the Mat_MPIAIJCUSPARSE structure are updated here; this
   routine does not touch the sequential blocks itself.
   Any other value of op raises PETSC_ERR_SUP.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* The message names exactly the three operations handled above */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",(int)op);
  }
  PetscFunctionReturn(0);
}
e057df02SPaul Mullowney
/*
   MatSetFromOptions_MPIAIJCUSPARSE - Processes the options database entries
   that select the GPU storage formats of an MPIAIJCUSPARSE matrix.

   Input Parameters:
+  PetscOptionsObject - the options context
-  A                  - the parallel matrix

   Options Database Keys (only queried when A->factortype is MAT_FACTOR_NONE):
+  -mat_cusparse_mult_diag_storage_format    - format applied with MAT_CUSPARSE_MULT_DIAG
.  -mat_cusparse_mult_offdiag_storage_format - format applied with MAT_CUSPARSE_MULT_OFFDIAG
-  -mat_cusparse_storage_format              - format applied with MAT_CUSPARSE_ALL

   Notes:
   The options are queried in the order listed above, and each one that is set
   is applied immediately through MatCUSPARSESetFormat().
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_MPIAIJ               *mpimat = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *gpu    = (Mat_MPIAIJCUSPARSE*)mpimat->spptr;
  MatCUSPARSEStorageFormat fmt;   /* only read when the matching option was set */
  PetscBool                isset;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* format of the diagonal block; the current value is passed as the default */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)gpu->diagGPUMatFormat,(PetscEnum*)&fmt,&isset);CHKERRQ(ierr);
    if (isset) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr);
    }

    /* format of the off-diagonal block; the current value is passed as the default */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)gpu->offdiagGPUMatFormat,(PetscEnum*)&fmt,&isset);CHKERRQ(ierr);
    if (isset) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr);
    }

    /* one format for both blocks; the diagonal block's current value is passed as the default */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)gpu->diagGPUMatFormat,(PetscEnum*)&fmt,&isset);CHKERRQ(ierr);
    if (isset) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
e057df02SPaul Mullowney
/*
   MatAssemblyEnd_MPIAIJCUSPARSE - Completes assembly of an MPIAIJCUSPARSE
   matrix, installed as A->ops->assemblyend.

   Input Parameters:
+  A    - the parallel matrix
-  mode - the assembly type

   Notes:
   Delegates to MatAssemblyEnd_MPIAIJ().  Afterwards, if mode is
   MAT_FINAL_ASSEMBLY and A->was_assembled is not set, the local work vector
   lvec is switched to type VECSEQCUDA.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ     *mpimat = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FINAL_ASSEMBLY && !A->was_assembled) {
    /* make the work vector used by the mult routines a CUDA vector */
    ierr = VecSetType(mpimat->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
34d6c7a5SJose E. Roman
/*
   MatDestroy_MPIAIJCUSPARSE - Releases the GPU-specific data of an
   MPIAIJCUSPARSE matrix and then destroys the underlying MPIAIJ matrix;
   installed as A->ops->destroy.

   Input Parameter:
.  A - the parallel matrix

   Notes:
   The cuSPARSE handle is cleared from the sequential blocks, then the handle
   and the CUDA stream are destroyed and the Mat_MPIAIJCUSPARSE structure is
   deleted, before MatDestroy_MPIAIJ() is called.

   The sequential blocks a->A and a->B are only created by the preallocation
   routine, so they may not exist when a matrix is destroyed without ever
   having been preallocated; the handle is therefore only cleared on blocks
   that are present.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* Detach the handle from the blocks before destroying it below */
    if (a->A) {
      ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    }
    if (a->B) {
      ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    }
    /* NOTE(review): stat is a cusparseStatus_t but is checked with CHKERRCUDA -- confirm this macro is appropriate for cuSPARSE status codes */
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
ca45077fSPaul Mullowney
/*
   MatCreate_MPIAIJCUSPARSE - Constructor for the MATMPIAIJCUSPARSE matrix type.

   Input Parameter:
.  A - the matrix being given the MATMPIAIJCUSPARSE type

   Notes:
   Builds on MatCreate_MPIAIJ(), then
     - composes the CUSPARSE-specific preallocation routine,
     - makes VECCUDA the default vector type,
     - allocates the Mat_MPIAIJCUSPARSE structure with both block formats set
       to MAT_CUSPARSE_CSR and creates its cuSPARSE handle and CUDA stream,
     - overrides the assemblyend, mult, multadd, multtranspose, setfromoptions
       and destroy operations,
     - sets the type name and composes the MatCUSPARSESetFormat implementation.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *mpimat;
  Mat_MPIAIJCUSPARSE *gpu;
  cusparseStatus_t   stat;
  cudaError_t        err;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* Start from a plain MPIAIJ matrix and specialize it */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  /* Replace the default vector type with the CUDA one */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* GPU-side bookkeeping hangs off the spptr slot of the MPIAIJ data */
  mpimat        = (Mat_MPIAIJ*)A->data;
  gpu           = new Mat_MPIAIJCUSPARSE;
  mpimat->spptr = gpu;

  gpu->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  gpu->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  /* NOTE(review): stat is a cusparseStatus_t but is checked with CHKERRCUDA -- confirm this macro is appropriate for cuSPARSE status codes */
  stat = cusparseCreate(&(gpu->handle));CHKERRCUDA(stat);
  err = cudaStreamCreate(&(gpu->stream));CHKERRCUDA(err);

  /* Install the CUSPARSE-specific operations */
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9ae82921SPaul Mullowney
3f9c0db1SPaul Mullowney/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.
9ae82921SPaul Mullowney
d083f849SBarry Smith   Collective
e057df02SPaul Mullowney
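   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows
.  n - number of local columns
.  M - number of global rows
.  N - number of global columns
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows);
          not used when comm has a single process
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL;
           not used when comm has a single process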
e057df02SPaul Mullowney
e057df02SPaul Mullowney   Output Parameter:
e057df02SPaul Mullowney.  A - the matrix
e057df02SPaul Mullowney
e057df02SPaul Mullowney   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
e057df02SPaul Mullowney   MatXXXXSetPreallocation() paradigm instead of this routine directly.
e057df02SPaul Mullowney   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
e057df02SPaul Mullowney
e057df02SPaul Mullowney   Notes:
   If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored
e057df02SPaul Mullowney
e057df02SPaul Mullowney   The AIJ format (also called the Yale sparse matrix format or
e057df02SPaul Mullowney   compressed row storage), is fully compatible with standard Fortran 77
e057df02SPaul Mullowney   storage.  That is, the stored row and column indices can begin at
e057df02SPaul Mullowney   either one (as in Fortran) or zero.  See the users' manual for details.
e057df02SPaul Mullowney
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
e057df02SPaul Mullowney
e057df02SPaul Mullowney   By default, this format uses inodes (identical nodes) when possible, to
e057df02SPaul Mullowney   improve numerical efficiency of matrix-vector products and solves. We
e057df02SPaul Mullowney   search for consecutive rows with the same nonzero structure, thereby
e057df02SPaul Mullowney   reusing matrix information to achieve increased efficiency.
e057df02SPaul Mullowney
e057df02SPaul Mullowney   Level: intermediate
e057df02SPaul Mullowney
e057df02SPaul Mullowney.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
e057df02SPaul Mullowney@*/
/* Creates the matrix, sets its sizes, then picks the sequential or the parallel
   CUSPARSE type (and the matching preallocation call) from the size of comm. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt    nproc;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc <= 1) {
    /* single process: sequential type; only d_nz and d_nnz are used */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    /* several processes: parallel type with both sets of counts */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
9ae82921SPaul Mullowney
3ca39a21SBarry Smith/*MC
e057df02SPaul Mullowney   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
e057df02SPaul Mullowney
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
9ae82921SPaul Mullowney
9ae82921SPaul Mullowney   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
9ae82921SPaul Mullowney   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
9ae82921SPaul Mullowney   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
9ae82921SPaul Mullowney   for communicators controlling multiple processes.  It is recommended that you call both of
9ae82921SPaul Mullowney   the above preallocation routines for simplicity.
9ae82921SPaul Mullowney
9ae82921SPaul Mullowney   Options Database Keys:
e057df02SPaul Mullowney+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
8468deeeSKarl Rupp.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8468deeeSKarl Rupp.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8468deeeSKarl Rupp-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
9ae82921SPaul Mullowney
9ae82921SPaul Mullowney  Level: beginner
9ae82921SPaul Mullowney
8468deeeSKarl Rupp .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/