#define PETSC_SKIP_COMPLEX
#include <petscconf.h>
PETSC_CUDA_EXTERN_C_BEGIN
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
PETSC_CUDA_EXTERN_C_END
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In particular,
     the diagonal block multiplication kernel is launched in one stream. Then,
     in a separate stream, the data transfers from DeviceToHost (with MPI
     messaging in between) and back HostToDevice are launched. Once the data
     transfer stream is synchronized, to ensure the messaging is complete, the
     off-diagonal MatMultAdd kernel is launched in the original (MatMult)
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* The same stream-splitting sequence as MatMult_MPIAIJCUSPARSE above, applied
     to the transpose product: the local transpose multiplications with the
     diagonal and off-diagonal blocks are launched, and the off-process
     contributions held in lvec are then accumulated into yy through a reverse
     scatter (DeviceToHost transfer, MPI messaging, HostToDevice transfer).

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
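
/* Type-specific back end for MatCUSPARSESetFormat(): records the requested cuSPARSE storage
   format for the diagonal block, the off-diagonal block, or both, so that it is applied to the
   sequential blocks when the matrix is preallocated. */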
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
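
/* Constructor for MATMPIAIJCUSPARSE: builds the underlying MPIAIJ matrix, attaches the
   Mat_MPIAIJCUSPARSE context (default storage formats plus the cuSPARSE handle and CUDA stream
   shared by the diagonal and off-diagonal blocks), and overrides the matrix operations
   implemented above. */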
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err);

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
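
/* A minimal usage sketch (illustrative only: the communicator, the local sizes mlocal/nlocal,
   and the preallocation counts are placeholders) following the MatCreate()/MatSetType()/
   MatXXXXSetPreallocation() paradigm recommended above instead of calling
   MatCreateAIJCUSPARSE() directly:

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,mlocal,nlocal,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = MatSetType(A,MATAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetFromOptions(A);CHKERRQ(ierr);
     ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);

   The matrix is then filled with MatSetValues() and assembled with MatAssemblyBegin()/
   MatAssemblyEnd() as usual. Calling both preallocation routines works for any communicator
   size, as noted in the MATAIJCUSPARSE documentation below. */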

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/