1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 20fd2b57fSKarl Rupp 33d13b8fdSMatthew G. Knepley #include <petscconf.h> 49ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 53d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 69ae82921SPaul Mullowney 79ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 89ae82921SPaul Mullowney { 9bbf3fe20SPaul Mullowney Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 10bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 119ae82921SPaul Mullowney PetscErrorCode ierr; 129ae82921SPaul Mullowney PetscInt i; 139ae82921SPaul Mullowney 149ae82921SPaul Mullowney PetscFunctionBegin; 159ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 169ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 179ae82921SPaul Mullowney if (d_nnz) { 189ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 199ae82921SPaul Mullowney if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 209ae82921SPaul Mullowney } 219ae82921SPaul Mullowney } 229ae82921SPaul Mullowney if (o_nnz) { 239ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 249ae82921SPaul Mullowney if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 259ae82921SPaul Mullowney } 269ae82921SPaul Mullowney } 279ae82921SPaul Mullowney if (!B->preallocated) { 28bbf3fe20SPaul Mullowney /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 299ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 309ae82921SPaul Mullowney ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 319ae82921SPaul Mullowney ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 323bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 339ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 349ae82921SPaul Mullowney ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 359ae82921SPaul Mullowney ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 363bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 379ae82921SPaul Mullowney } 389ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 399ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 40e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 41e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 42b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 43b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 44b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 45b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 462205254eSKarl Rupp 479ae82921SPaul Mullowney B->preallocated = PETSC_TRUE; 489ae82921SPaul Mullowney PetscFunctionReturn(0); 499ae82921SPaul Mullowney } 50e057df02SPaul Mullowney 512a7a6963SBarry Smith PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 529ae82921SPaul Mullowney { 539ae82921SPaul Mullowney PetscErrorCode ierr; 5433d57670SJed Brown PetscInt rbs,cbs; 559ae82921SPaul Mullowney 569ae82921SPaul Mullowney PetscFunctionBegin; 5733d57670SJed Brown ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 589ae82921SPaul Mullowney if (right) { 59ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 609ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 6133d57670SJed Brown ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 62c41cb2e2SAlejandro Lamas Daviña ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr); 63cbf1f8acSSatish Balay ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 649ae82921SPaul Mullowney } 659ae82921SPaul Mullowney if (left) { 66ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 679ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 6833d57670SJed Brown ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 69c41cb2e2SAlejandro Lamas Daviña ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr); 70cbf1f8acSSatish Balay ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 71cbf1f8acSSatish Balay 72cbf1f8acSSatish Balay 739ae82921SPaul Mullowney } 749ae82921SPaul Mullowney PetscFunctionReturn(0); 759ae82921SPaul Mullowney } 769ae82921SPaul Mullowney 779ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 789ae82921SPaul Mullowney { 79e057df02SPaul Mullowney /* This multiplication sequence is different sequence 80e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 81e057df02SPaul Mullowney multiplication kernel is launched in one stream. Then, 82e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 83e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 84e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 85e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 86e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 87e057df02SPaul Mullowney against race conditions. 88e057df02SPaul Mullowney 89e057df02SPaul Mullowney This sequence should only be called for GPU computation. */ 909ae82921SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 919ae82921SPaul Mullowney PetscErrorCode ierr; 929ae82921SPaul Mullowney PetscInt nt; 939ae82921SPaul Mullowney 949ae82921SPaul Mullowney PetscFunctionBegin; 959ae82921SPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 969ae82921SPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 979ae82921SPaul Mullowney ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 989ae82921SPaul Mullowney ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 999ae82921SPaul Mullowney ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 1009ae82921SPaul Mullowney ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 1019ae82921SPaul Mullowney ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 1029ae82921SPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 1039ae82921SPaul Mullowney PetscFunctionReturn(0); 1049ae82921SPaul Mullowney } 105ca45077fSPaul Mullowney 106ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 107ca45077fSPaul Mullowney { 108e057df02SPaul Mullowney /* This multiplication sequence is different sequence 109e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 110e057df02SPaul Mullowney multiplication kernel is launched in one stream. Then, 111e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 112e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 113e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 114e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 115e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 116e057df02SPaul Mullowney against race conditions. 117e057df02SPaul Mullowney 118e057df02SPaul Mullowney This sequence should only be called for GPU computation. */ 119ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 120ca45077fSPaul Mullowney PetscErrorCode ierr; 121ca45077fSPaul Mullowney PetscInt nt; 122ca45077fSPaul Mullowney 123ca45077fSPaul Mullowney PetscFunctionBegin; 124ca45077fSPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 125ca45077fSPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 126*9b2db222SKarl Rupp ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr); 127*9b2db222SKarl Rupp ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 128ca45077fSPaul Mullowney ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 129*9b2db222SKarl Rupp ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 130*9b2db222SKarl Rupp ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 131ca45077fSPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 132ca45077fSPaul Mullowney PetscFunctionReturn(0); 133ca45077fSPaul Mullowney } 1349ae82921SPaul Mullowney 135e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 136ca45077fSPaul Mullowney { 137ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 138bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 139e057df02SPaul Mullowney 140ca45077fSPaul Mullowney PetscFunctionBegin; 141e057df02SPaul Mullowney switch (op) { 142e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 143e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 144045c96e1SPaul Mullowney break; 145e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 146e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 147045c96e1SPaul Mullowney break; 148e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 149e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 150e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 151045c96e1SPaul Mullowney break; 152e057df02SPaul Mullowney default: 153e057df02SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 154045c96e1SPaul Mullowney } 155ca45077fSPaul Mullowney PetscFunctionReturn(0); 156ca45077fSPaul Mullowney } 157e057df02SPaul Mullowney 1584416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 159e057df02SPaul Mullowney { 160e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 161e057df02SPaul Mullowney PetscErrorCode ierr; 162e057df02SPaul Mullowney PetscBool flg; 163a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 164a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 1655fd66863SKarl Rupp 166e057df02SPaul Mullowney PetscFunctionBegin; 167e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 168e057df02SPaul Mullowney ierr = PetscObjectOptionsBegin((PetscObject)A); 169e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 170e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 171a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 172e057df02SPaul Mullowney if (flg) { 173e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 174e057df02SPaul Mullowney } 175e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 176a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 177e057df02SPaul Mullowney if (flg) { 178e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 179e057df02SPaul Mullowney } 180e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 181a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 182e057df02SPaul Mullowney if (flg) { 183e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 184e057df02SPaul Mullowney } 185e057df02SPaul Mullowney } 186e057df02SPaul Mullowney ierr = PetscOptionsEnd();CHKERRQ(ierr); 187e057df02SPaul Mullowney PetscFunctionReturn(0); 188e057df02SPaul Mullowney } 189e057df02SPaul Mullowney 19034d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 19134d6c7a5SJose E. Roman { 19234d6c7a5SJose E. Roman PetscErrorCode ierr; 19334d6c7a5SJose E. Roman Mat_MPIAIJ *mpiaij; 19434d6c7a5SJose E. Roman 19534d6c7a5SJose E. Roman PetscFunctionBegin; 19634d6c7a5SJose E. Roman mpiaij = (Mat_MPIAIJ*)A->data; 19734d6c7a5SJose E. Roman ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 19834d6c7a5SJose E. Roman if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 19934d6c7a5SJose E. Roman ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 20034d6c7a5SJose E. Roman } 20134d6c7a5SJose E. Roman PetscFunctionReturn(0); 20234d6c7a5SJose E. Roman } 20334d6c7a5SJose E. Roman 204bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 205bbf3fe20SPaul Mullowney { 206bbf3fe20SPaul Mullowney PetscErrorCode ierr; 207bbf3fe20SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 208bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 209b06137fdSPaul Mullowney cudaError_t err; 210b06137fdSPaul Mullowney cusparseStatus_t stat; 211bbf3fe20SPaul Mullowney 212bbf3fe20SPaul Mullowney PetscFunctionBegin; 213bbf3fe20SPaul Mullowney try { 214b06137fdSPaul Mullowney ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 215b06137fdSPaul Mullowney ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 216c41cb2e2SAlejandro Lamas Daviña stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat); 217c41cb2e2SAlejandro Lamas Daviña err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 218bbf3fe20SPaul Mullowney delete cusparseStruct; 219bbf3fe20SPaul Mullowney } catch(char *ex) { 220bbf3fe20SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 221bbf3fe20SPaul Mullowney } 222bbf3fe20SPaul Mullowney cusparseStruct = 0; 2232205254eSKarl Rupp 224bbf3fe20SPaul Mullowney ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 225bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 226bbf3fe20SPaul Mullowney } 227ca45077fSPaul Mullowney 2288cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 2299ae82921SPaul Mullowney { 2309ae82921SPaul Mullowney PetscErrorCode ierr; 231bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 232bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct; 233b06137fdSPaul Mullowney cudaError_t err; 234b06137fdSPaul Mullowney cusparseStatus_t stat; 2359ae82921SPaul Mullowney 2369ae82921SPaul Mullowney PetscFunctionBegin; 237bbf3fe20SPaul Mullowney ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 238bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 239bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ*)A->data; 240bbf3fe20SPaul Mullowney a->spptr = new Mat_MPIAIJCUSPARSE; 2412205254eSKarl Rupp 242bbf3fe20SPaul Mullowney cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 243e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 244e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 245c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat); 246c41cb2e2SAlejandro Lamas Daviña err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err); 2472205254eSKarl Rupp 24834d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 2492a7a6963SBarry Smith A->ops->getvecs = MatCreateVecs_MPIAIJCUSPARSE; 250bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 251bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 252bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 253bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 2542205254eSKarl Rupp 255bbf3fe20SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 256bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 2579ae82921SPaul Mullowney PetscFunctionReturn(0); 2589ae82921SPaul Mullowney } 2599ae82921SPaul Mullowney 2603f9c0db1SPaul Mullowney /*@ 2613f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 262e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2633f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 264e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 265e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 266e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 2679ae82921SPaul Mullowney 268e057df02SPaul Mullowney Collective on MPI_Comm 269e057df02SPaul Mullowney 270e057df02SPaul Mullowney Input Parameters: 271e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 272e057df02SPaul Mullowney . m - number of rows 273e057df02SPaul Mullowney . n - number of columns 274e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 275e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 2760298fd71SBarry Smith (possibly different for each row) or NULL 277e057df02SPaul Mullowney 278e057df02SPaul Mullowney Output Parameter: 279e057df02SPaul Mullowney . A - the matrix 280e057df02SPaul Mullowney 281e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 282e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 283e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 284e057df02SPaul Mullowney 285e057df02SPaul Mullowney Notes: 286e057df02SPaul Mullowney If nnz is given then nz is ignored 287e057df02SPaul Mullowney 288e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 289e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 290e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 291e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 292e057df02SPaul Mullowney 293e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 2940298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 295e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 296e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 297e057df02SPaul Mullowney 298e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 299e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 300e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 301e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 302e057df02SPaul Mullowney 303e057df02SPaul Mullowney Level: intermediate 304e057df02SPaul Mullowney 305e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 306e057df02SPaul Mullowney @*/ 3079ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 3089ae82921SPaul Mullowney { 3099ae82921SPaul Mullowney PetscErrorCode ierr; 3109ae82921SPaul Mullowney PetscMPIInt size; 3119ae82921SPaul Mullowney 3129ae82921SPaul Mullowney PetscFunctionBegin; 3139ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 3149ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 3159ae82921SPaul Mullowney ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 3169ae82921SPaul Mullowney if (size > 1) { 3179ae82921SPaul Mullowney ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 3189ae82921SPaul Mullowney ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 3199ae82921SPaul Mullowney } else { 3209ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3219ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 3229ae82921SPaul Mullowney } 3239ae82921SPaul Mullowney PetscFunctionReturn(0); 3249ae82921SPaul Mullowney } 3259ae82921SPaul Mullowney 3263ca39a21SBarry Smith /*MC 327e057df02SPaul Mullowney MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 328e057df02SPaul Mullowney 3292692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3302692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3312692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3329ae82921SPaul Mullowney 3339ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 3349ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 3359ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 3369ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 3379ae82921SPaul Mullowney the above preallocation routines for simplicity. 3389ae82921SPaul Mullowney 3399ae82921SPaul Mullowney Options Database Keys: 340e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 3418468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3428468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3438468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3449ae82921SPaul Mullowney 3459ae82921SPaul Mullowney Level: beginner 3469ae82921SPaul Mullowney 3478468deeeSKarl Rupp .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3488468deeeSKarl Rupp M 3499ae82921SPaul Mullowney M*/ 350