#include <petscconf.h>
PETSC_CUDA_EXTERN_C_BEGIN
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
PETSC_CUDA_EXTERN_C_END
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different from the CPU version. In
     particular, the diagonal block multiplication kernel is launched in one
     stream. Then, in a separate stream, the DeviceToHost data transfers (with
     MPI messaging in between) and the HostToDevice transfers are launched.
     Once the data transfer stream is synchronized, to ensure the messaging is
     complete, the MatMultAdd kernel is launched in the original (MatMult)
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different from the CPU version. In
     particular, the diagonal block multiplication kernel is launched in one
     stream. Then, in a separate stream, the DeviceToHost data transfers (with
     MPI messaging in between) and the HostToDevice transfers are launched.
     Once the data transfer stream is synchronized, to ensure the messaging is
     complete, the MatMultTransposeAdd kernel is launched in the original
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err);

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated)
.  n - number of local columns (or PETSC_DECIDE to have it calculated)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
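
   Example usage (a minimal sketch; the global size and the per-row nonzero
   counts below are illustrative assumptions, and error checking with CHKERRQ()
   is omitted for brevity):

.vb
   Mat A;
   MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);
   ... fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
   MatDestroy(&A);
.ve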

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid (HYB) format; the ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
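
/*
   Example: a minimal sketch of the MatCreate()/MatSetType()/MatSetFromOptions()
   paradigm recommended in the man page above.  The sizes and preallocation counts
   are illustrative assumptions, and error checking with CHKERRQ() is omitted for
   brevity.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);
     MatSetType(A,MATMPIAIJCUSPARSE);
     MatSetFromOptions(A);
     MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);
     MatSeqAIJSetPreallocation(A,5,NULL);
     ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), MatMult() ...
     MatDestroy(&A);

   With MatSetFromOptions() in place, the storage format can be chosen at runtime,
   e.g. with -mat_cusparse_storage_format hyb.
*/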