1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 20fd2b57fSKarl Rupp 33d13b8fdSMatthew G. Knepley #include <petscconf.h> 49ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 53d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 69ae82921SPaul Mullowney 7b06137fdSPaul Mullowney 89ae82921SPaul Mullowney #undef __FUNCT__ 99ae82921SPaul Mullowney #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 109ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 119ae82921SPaul Mullowney { 12bbf3fe20SPaul Mullowney Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 149ae82921SPaul Mullowney PetscErrorCode ierr; 159ae82921SPaul Mullowney PetscInt i; 169ae82921SPaul Mullowney 179ae82921SPaul Mullowney PetscFunctionBegin; 189ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 199ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 209ae82921SPaul Mullowney if (d_nnz) { 219ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 229ae82921SPaul Mullowney if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 239ae82921SPaul Mullowney } 249ae82921SPaul Mullowney } 259ae82921SPaul Mullowney if (o_nnz) { 269ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 279ae82921SPaul Mullowney if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 289ae82921SPaul Mullowney } 299ae82921SPaul Mullowney } 309ae82921SPaul Mullowney if (!B->preallocated) { 31bbf3fe20SPaul Mullowney /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 329ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 339ae82921SPaul Mullowney ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 349ae82921SPaul Mullowney ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 353bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 369ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 379ae82921SPaul Mullowney ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 389ae82921SPaul Mullowney ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 393bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 409ae82921SPaul Mullowney } 419ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 429ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 43e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 44e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 45b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 46b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 47b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 48b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 492205254eSKarl Rupp 509ae82921SPaul Mullowney B->preallocated = PETSC_TRUE; 519ae82921SPaul Mullowney PetscFunctionReturn(0); 529ae82921SPaul Mullowney } 53e057df02SPaul Mullowney 549ae82921SPaul Mullowney #undef __FUNCT__ 552a7a6963SBarry Smith #define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE" 562a7a6963SBarry Smith PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 579ae82921SPaul Mullowney { 589ae82921SPaul Mullowney PetscErrorCode ierr; 5933d57670SJed Brown PetscInt rbs,cbs; 609ae82921SPaul Mullowney 619ae82921SPaul Mullowney PetscFunctionBegin; 6233d57670SJed Brown ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 639ae82921SPaul Mullowney if (right) { 64ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 659ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 6633d57670SJed Brown ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 67c41cb2e2SAlejandro Lamas Daviña ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr); 68cbf1f8acSSatish Balay ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 699ae82921SPaul Mullowney } 709ae82921SPaul Mullowney if (left) { 71ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 729ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 7333d57670SJed Brown ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 74c41cb2e2SAlejandro Lamas Daviña ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr); 75cbf1f8acSSatish Balay ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 76cbf1f8acSSatish Balay 77cbf1f8acSSatish Balay 789ae82921SPaul Mullowney } 799ae82921SPaul Mullowney PetscFunctionReturn(0); 809ae82921SPaul Mullowney } 819ae82921SPaul Mullowney 829ae82921SPaul Mullowney #undef __FUNCT__ 839ae82921SPaul Mullowney #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 849ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 859ae82921SPaul Mullowney { 86e057df02SPaul Mullowney /* This multiplication sequence is different sequence 87e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 88e057df02SPaul Mullowney multiplication kernel is launched in one stream. Then, 89e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 90e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 91e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 92e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 93e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 94e057df02SPaul Mullowney against race conditions. 95e057df02SPaul Mullowney 96e057df02SPaul Mullowney This sequence should only be called for GPU computation. */ 979ae82921SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 989ae82921SPaul Mullowney PetscErrorCode ierr; 999ae82921SPaul Mullowney PetscInt nt; 1009ae82921SPaul Mullowney 1019ae82921SPaul Mullowney PetscFunctionBegin; 1029ae82921SPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 1039ae82921SPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 1049ae82921SPaul Mullowney ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 1059ae82921SPaul Mullowney ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 1069ae82921SPaul Mullowney ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 1079ae82921SPaul Mullowney ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 1089ae82921SPaul Mullowney ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 1099ae82921SPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 1109ae82921SPaul Mullowney PetscFunctionReturn(0); 1119ae82921SPaul Mullowney } 112ca45077fSPaul Mullowney 113ca45077fSPaul Mullowney #undef __FUNCT__ 114120f8b2dSAlejandro Lamas Daviña #define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE" 115ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 116ca45077fSPaul Mullowney { 117e057df02SPaul Mullowney /* This multiplication sequence is different sequence 118e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 119e057df02SPaul Mullowney multiplication kernel is launched in one stream. Then, 120e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 121e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 122e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 123e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 124e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 125e057df02SPaul Mullowney against race conditions. 126e057df02SPaul Mullowney 127e057df02SPaul Mullowney This sequence should only be called for GPU computation. */ 128ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 129ca45077fSPaul Mullowney PetscErrorCode ierr; 130ca45077fSPaul Mullowney PetscInt nt; 131ca45077fSPaul Mullowney 132ca45077fSPaul Mullowney PetscFunctionBegin; 133ca45077fSPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 134ca45077fSPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 135ca45077fSPaul Mullowney ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 136ca45077fSPaul Mullowney ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 137ca45077fSPaul Mullowney ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 138ca45077fSPaul Mullowney ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 139ca45077fSPaul Mullowney ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 140ca45077fSPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 141ca45077fSPaul Mullowney PetscFunctionReturn(0); 142ca45077fSPaul Mullowney } 1439ae82921SPaul Mullowney 144ca45077fSPaul Mullowney #undef __FUNCT__ 145e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 146e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 147ca45077fSPaul Mullowney { 148ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 149bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 150e057df02SPaul Mullowney 151ca45077fSPaul Mullowney PetscFunctionBegin; 152e057df02SPaul Mullowney switch (op) { 153e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 154e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 155045c96e1SPaul Mullowney break; 156e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 157e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 158045c96e1SPaul Mullowney break; 159e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 160e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 161e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 162045c96e1SPaul Mullowney break; 163e057df02SPaul Mullowney default: 164e057df02SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 165045c96e1SPaul Mullowney } 166ca45077fSPaul Mullowney PetscFunctionReturn(0); 167ca45077fSPaul Mullowney } 168e057df02SPaul Mullowney 169e057df02SPaul Mullowney #undef __FUNCT__ 170e057df02SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 1714416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 172e057df02SPaul Mullowney { 173e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 174e057df02SPaul Mullowney PetscErrorCode ierr; 175e057df02SPaul Mullowney PetscBool flg; 176a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 177a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 1785fd66863SKarl Rupp 179e057df02SPaul Mullowney PetscFunctionBegin; 180e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 181e057df02SPaul Mullowney ierr = PetscObjectOptionsBegin((PetscObject)A); 182e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 183e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 184a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 185e057df02SPaul Mullowney if (flg) { 186e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 187e057df02SPaul Mullowney } 188e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 189a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 190e057df02SPaul Mullowney if (flg) { 191e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 192e057df02SPaul Mullowney } 193e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 194a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 195e057df02SPaul Mullowney if (flg) { 196e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 197e057df02SPaul Mullowney } 198e057df02SPaul Mullowney } 199e057df02SPaul Mullowney ierr = PetscOptionsEnd();CHKERRQ(ierr); 200e057df02SPaul Mullowney PetscFunctionReturn(0); 201e057df02SPaul Mullowney } 202e057df02SPaul Mullowney 203*34d6c7a5SJose E. Roman PETSC_EXTERN PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat,MatAssemblyType); 204*34d6c7a5SJose E. Roman 205*34d6c7a5SJose E. Roman #undef __FUNCT__ 206*34d6c7a5SJose E. Roman #define __FUNCT__ "MatAssemblyEnd_MPIAIJCUSPARSE" 207*34d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 208*34d6c7a5SJose E. Roman { 209*34d6c7a5SJose E. Roman PetscErrorCode ierr; 210*34d6c7a5SJose E. Roman Mat_MPIAIJ *mpiaij; 211*34d6c7a5SJose E. Roman 212*34d6c7a5SJose E. Roman PetscFunctionBegin; 213*34d6c7a5SJose E. Roman mpiaij = (Mat_MPIAIJ*)A->data; 214*34d6c7a5SJose E. Roman ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 215*34d6c7a5SJose E. Roman if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 216*34d6c7a5SJose E. Roman ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 217*34d6c7a5SJose E. Roman } 218*34d6c7a5SJose E. Roman PetscFunctionReturn(0); 219*34d6c7a5SJose E. Roman } 220*34d6c7a5SJose E. Roman 221bbf3fe20SPaul Mullowney #undef __FUNCT__ 222bbf3fe20SPaul Mullowney #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 223bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 224bbf3fe20SPaul Mullowney { 225bbf3fe20SPaul Mullowney PetscErrorCode ierr; 226bbf3fe20SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 227bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 228b06137fdSPaul Mullowney cudaError_t err; 229b06137fdSPaul Mullowney cusparseStatus_t stat; 230bbf3fe20SPaul Mullowney 231bbf3fe20SPaul Mullowney PetscFunctionBegin; 232bbf3fe20SPaul Mullowney try { 233b06137fdSPaul Mullowney ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 234b06137fdSPaul Mullowney ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 235c41cb2e2SAlejandro Lamas Daviña stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat); 236c41cb2e2SAlejandro Lamas Daviña err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 237bbf3fe20SPaul Mullowney delete cusparseStruct; 238bbf3fe20SPaul Mullowney } catch(char *ex) { 239bbf3fe20SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 240bbf3fe20SPaul Mullowney } 241bbf3fe20SPaul Mullowney cusparseStruct = 0; 2422205254eSKarl Rupp 243bbf3fe20SPaul Mullowney ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 244bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 245bbf3fe20SPaul Mullowney } 246ca45077fSPaul Mullowney 2479ae82921SPaul Mullowney #undef __FUNCT__ 2489ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 2498cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 2509ae82921SPaul Mullowney { 2519ae82921SPaul Mullowney PetscErrorCode ierr; 252bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 253bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct; 254b06137fdSPaul Mullowney cudaError_t err; 255b06137fdSPaul Mullowney cusparseStatus_t stat; 2569ae82921SPaul Mullowney 2579ae82921SPaul Mullowney PetscFunctionBegin; 258bbf3fe20SPaul Mullowney ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 259bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 260bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ*)A->data; 261bbf3fe20SPaul Mullowney a->spptr = new Mat_MPIAIJCUSPARSE; 2622205254eSKarl Rupp 263bbf3fe20SPaul Mullowney cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 264e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 265e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 266c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat); 267c41cb2e2SAlejandro Lamas Daviña err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err); 2682205254eSKarl Rupp 269*34d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 2702a7a6963SBarry Smith A->ops->getvecs = MatCreateVecs_MPIAIJCUSPARSE; 271bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 272bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 273bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 274bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 2752205254eSKarl Rupp 276bbf3fe20SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 277bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 2789ae82921SPaul Mullowney PetscFunctionReturn(0); 2799ae82921SPaul Mullowney } 2809ae82921SPaul Mullowney 2813f9c0db1SPaul Mullowney /*@ 2823f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 283e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2843f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 285e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 286e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 287e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 2889ae82921SPaul Mullowney 289e057df02SPaul Mullowney Collective on MPI_Comm 290e057df02SPaul Mullowney 291e057df02SPaul Mullowney Input Parameters: 292e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 293e057df02SPaul Mullowney . m - number of rows 294e057df02SPaul Mullowney . n - number of columns 295e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 296e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 2970298fd71SBarry Smith (possibly different for each row) or NULL 298e057df02SPaul Mullowney 299e057df02SPaul Mullowney Output Parameter: 300e057df02SPaul Mullowney . A - the matrix 301e057df02SPaul Mullowney 302e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 303e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 304e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 305e057df02SPaul Mullowney 306e057df02SPaul Mullowney Notes: 307e057df02SPaul Mullowney If nnz is given then nz is ignored 308e057df02SPaul Mullowney 309e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 310e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 311e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 312e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 313e057df02SPaul Mullowney 314e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 3150298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 316e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 317e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 318e057df02SPaul Mullowney 319e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 320e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 321e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 322e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 323e057df02SPaul Mullowney 324e057df02SPaul Mullowney Level: intermediate 325e057df02SPaul Mullowney 326e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 327e057df02SPaul Mullowney @*/ 3289ae82921SPaul Mullowney #undef __FUNCT__ 3299ae82921SPaul Mullowney #define __FUNCT__ "MatCreateAIJCUSPARSE" 3309ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 3319ae82921SPaul Mullowney { 3329ae82921SPaul Mullowney PetscErrorCode ierr; 3339ae82921SPaul Mullowney PetscMPIInt size; 3349ae82921SPaul Mullowney 3359ae82921SPaul Mullowney PetscFunctionBegin; 3369ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 3379ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 3389ae82921SPaul Mullowney ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 3399ae82921SPaul Mullowney if (size > 1) { 3409ae82921SPaul Mullowney ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 3419ae82921SPaul Mullowney ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 3429ae82921SPaul Mullowney } else { 3439ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3449ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 3459ae82921SPaul Mullowney } 3469ae82921SPaul Mullowney PetscFunctionReturn(0); 3479ae82921SPaul Mullowney } 3489ae82921SPaul Mullowney 3493f9c0db1SPaul Mullowney /*M 350e057df02SPaul Mullowney MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 351e057df02SPaul Mullowney 3522692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3532692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3542692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3559ae82921SPaul Mullowney 3569ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 3579ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 3589ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 3599ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 3609ae82921SPaul Mullowney the above preallocation routines for simplicity. 3619ae82921SPaul Mullowney 3629ae82921SPaul Mullowney Options Database Keys: 363e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 3648468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3658468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3668468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3679ae82921SPaul Mullowney 3689ae82921SPaul Mullowney Level: beginner 3699ae82921SPaul Mullowney 3708468deeeSKarl Rupp .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3718468deeeSKarl Rupp M 3729ae82921SPaul Mullowney M*/ 373