1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 40fd2b57fSKarl Rupp 53d13b8fdSMatthew G. Knepley #include <petscconf.h> 69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 89ae82921SPaul Mullowney 99ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 109ae82921SPaul Mullowney { 11bbf3fe20SPaul Mullowney Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 12bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 139ae82921SPaul Mullowney PetscErrorCode ierr; 149ae82921SPaul Mullowney PetscInt i; 159ae82921SPaul Mullowney 169ae82921SPaul Mullowney PetscFunctionBegin; 179ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 189ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 199ae82921SPaul Mullowney if (d_nnz) { 209ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 219ae82921SPaul Mullowney if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 229ae82921SPaul Mullowney } 239ae82921SPaul Mullowney } 249ae82921SPaul Mullowney if (o_nnz) { 259ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 269ae82921SPaul Mullowney if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 279ae82921SPaul Mullowney } 289ae82921SPaul Mullowney } 299ae82921SPaul Mullowney if (!B->preallocated) { 30bbf3fe20SPaul Mullowney /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 319ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 32fdc842d1SBarry Smith ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr); 339ae82921SPaul Mullowney ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 349ae82921SPaul Mullowney ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 353bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 369ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 37fdc842d1SBarry Smith ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr); 389ae82921SPaul Mullowney ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 399ae82921SPaul Mullowney ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 403bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 419ae82921SPaul Mullowney } 429ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 439ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 44e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 45e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 46b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 47b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 48b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 49b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 502205254eSKarl Rupp 519ae82921SPaul Mullowney B->preallocated = PETSC_TRUE; 529ae82921SPaul Mullowney PetscFunctionReturn(0); 539ae82921SPaul Mullowney } 54e057df02SPaul Mullowney 559ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 569ae82921SPaul Mullowney { 57fdc842d1SBarry Smith /* 58fdc842d1SBarry Smith This multiplication sequence is different sequence 59e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 60e057df02SPaul Mullowney multiplication kernel is launched in one stream. Then, 61e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 62e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 63e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 64e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 65e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 66e057df02SPaul Mullowney against race conditions. 67fdc842d1SBarry Smith */ 689ae82921SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 699ae82921SPaul Mullowney PetscErrorCode ierr; 709ae82921SPaul Mullowney PetscInt nt; 719ae82921SPaul Mullowney 729ae82921SPaul Mullowney PetscFunctionBegin; 739ae82921SPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 749ae82921SPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 75959dcdf5SJunchao Zhang ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 769ae82921SPaul Mullowney ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 779ae82921SPaul Mullowney ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 789ae82921SPaul Mullowney ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 799ae82921SPaul Mullowney ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 809ae82921SPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 819ae82921SPaul Mullowney PetscFunctionReturn(0); 829ae82921SPaul Mullowney } 83ca45077fSPaul Mullowney 84fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 85fdc842d1SBarry Smith { 86fdc842d1SBarry Smith /* 87fdc842d1SBarry Smith This multiplication sequence is different sequence 88fdc842d1SBarry Smith than the CPU version. In particular, the diagonal block 89fdc842d1SBarry Smith multiplication kernel is launched in one stream. Then, 90fdc842d1SBarry Smith in a separate stream, the data transfers from DeviceToHost 91fdc842d1SBarry Smith (with MPI messaging in between), then HostToDevice are 92fdc842d1SBarry Smith launched. Once the data transfer stream is synchronized, 93fdc842d1SBarry Smith to ensure messaging is complete, the MatMultAdd kernel 94fdc842d1SBarry Smith is launched in the original (MatMult) stream to protect 95fdc842d1SBarry Smith against race conditions. 96fdc842d1SBarry Smith */ 97fdc842d1SBarry Smith Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 98fdc842d1SBarry Smith PetscErrorCode ierr; 99fdc842d1SBarry Smith PetscInt nt; 100fdc842d1SBarry Smith 101fdc842d1SBarry Smith PetscFunctionBegin; 102fdc842d1SBarry Smith ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 103fdc842d1SBarry Smith if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 104fdc842d1SBarry Smith ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 105fdc842d1SBarry Smith ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 106fdc842d1SBarry Smith ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 107fdc842d1SBarry Smith ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 108fdc842d1SBarry Smith ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 109fdc842d1SBarry Smith ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 110fdc842d1SBarry Smith PetscFunctionReturn(0); 111fdc842d1SBarry Smith } 112fdc842d1SBarry Smith 113ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 114ca45077fSPaul Mullowney { 115e057df02SPaul Mullowney /* This multiplication sequence is different sequence 116e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 117e057df02SPaul Mullowney multiplication kernel is launched in one stream. Then, 118e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 119e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 120e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 121e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 122e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 123e057df02SPaul Mullowney against race conditions. 124e057df02SPaul Mullowney 125e057df02SPaul Mullowney This sequence should only be called for GPU computation. */ 126ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 127ca45077fSPaul Mullowney PetscErrorCode ierr; 128ca45077fSPaul Mullowney PetscInt nt; 129ca45077fSPaul Mullowney 130ca45077fSPaul Mullowney PetscFunctionBegin; 131ca45077fSPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 132ccf5f80bSJunchao Zhang if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 133a3fdcf43SKarl Rupp ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr); 1349b2db222SKarl Rupp ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 135ca45077fSPaul Mullowney ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 1369b2db222SKarl Rupp ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 1379b2db222SKarl Rupp ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 138ca45077fSPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 139ca45077fSPaul Mullowney PetscFunctionReturn(0); 140ca45077fSPaul Mullowney } 1419ae82921SPaul Mullowney 142e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 143ca45077fSPaul Mullowney { 144ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 145bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 146e057df02SPaul Mullowney 147ca45077fSPaul Mullowney PetscFunctionBegin; 148e057df02SPaul Mullowney switch (op) { 149e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 150e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 151045c96e1SPaul Mullowney break; 152e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 153e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 154045c96e1SPaul Mullowney break; 155e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 156e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 157e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 158045c96e1SPaul Mullowney break; 159e057df02SPaul Mullowney default: 160e057df02SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 161045c96e1SPaul Mullowney } 162ca45077fSPaul Mullowney PetscFunctionReturn(0); 163ca45077fSPaul Mullowney } 164e057df02SPaul Mullowney 1654416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 166e057df02SPaul Mullowney { 167e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 168e057df02SPaul Mullowney PetscErrorCode ierr; 169e057df02SPaul Mullowney PetscBool flg; 170a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 171a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 1725fd66863SKarl Rupp 173e057df02SPaul Mullowney PetscFunctionBegin; 174e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 175e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 176e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 177a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 178e057df02SPaul Mullowney if (flg) { 179e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 180e057df02SPaul Mullowney } 181e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 182a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 183e057df02SPaul Mullowney if (flg) { 184e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 185e057df02SPaul Mullowney } 186e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 187a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 188e057df02SPaul Mullowney if (flg) { 189e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 190e057df02SPaul Mullowney } 191e057df02SPaul Mullowney } 1920af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 193e057df02SPaul Mullowney PetscFunctionReturn(0); 194e057df02SPaul Mullowney } 195e057df02SPaul Mullowney 19634d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 19734d6c7a5SJose E. Roman { 19834d6c7a5SJose E. Roman PetscErrorCode ierr; 19934d6c7a5SJose E. Roman Mat_MPIAIJ *mpiaij; 20034d6c7a5SJose E. Roman 20134d6c7a5SJose E. Roman PetscFunctionBegin; 20234d6c7a5SJose E. Roman mpiaij = (Mat_MPIAIJ*)A->data; 20334d6c7a5SJose E. Roman ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 20434d6c7a5SJose E. Roman if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 20534d6c7a5SJose E. Roman ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 20634d6c7a5SJose E. Roman } 20734d6c7a5SJose E. Roman PetscFunctionReturn(0); 20834d6c7a5SJose E. Roman } 20934d6c7a5SJose E. Roman 210bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 211bbf3fe20SPaul Mullowney { 212bbf3fe20SPaul Mullowney PetscErrorCode ierr; 213bbf3fe20SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 214bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 215b06137fdSPaul Mullowney cudaError_t err; 216b06137fdSPaul Mullowney cusparseStatus_t stat; 217bbf3fe20SPaul Mullowney 218bbf3fe20SPaul Mullowney PetscFunctionBegin; 219bbf3fe20SPaul Mullowney try { 220b06137fdSPaul Mullowney ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 221b06137fdSPaul Mullowney ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 222c41cb2e2SAlejandro Lamas Daviña stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat); 223*17403302SKarl Rupp if (cusparseStruct->stream) { 224c41cb2e2SAlejandro Lamas Daviña err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 225*17403302SKarl Rupp } 226bbf3fe20SPaul Mullowney delete cusparseStruct; 227bbf3fe20SPaul Mullowney } catch(char *ex) { 228bbf3fe20SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 229bbf3fe20SPaul Mullowney } 230bbf3fe20SPaul Mullowney ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 231bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 232bbf3fe20SPaul Mullowney } 233ca45077fSPaul Mullowney 2348cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 2359ae82921SPaul Mullowney { 2369ae82921SPaul Mullowney PetscErrorCode ierr; 237bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 238bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct; 239b06137fdSPaul Mullowney cusparseStatus_t stat; 2409ae82921SPaul Mullowney 2419ae82921SPaul Mullowney PetscFunctionBegin; 242bbf3fe20SPaul Mullowney ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 243bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 24434136279SStefano Zampini ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 24534136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 24634136279SStefano Zampini 247bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ*)A->data; 248bbf3fe20SPaul Mullowney a->spptr = new Mat_MPIAIJCUSPARSE; 2492205254eSKarl Rupp 250bbf3fe20SPaul Mullowney cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 251e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 252e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 253*17403302SKarl Rupp cusparseStruct->stream = 0; 254c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat); 2552205254eSKarl Rupp 25634d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 257bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 258fdc842d1SBarry Smith A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 259bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 260bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 261bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 2622205254eSKarl Rupp 263bbf3fe20SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 264bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 2659ae82921SPaul Mullowney PetscFunctionReturn(0); 2669ae82921SPaul Mullowney } 2679ae82921SPaul Mullowney 2683f9c0db1SPaul Mullowney /*@ 2693f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 270e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2713f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 272e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 273e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 274e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 2759ae82921SPaul Mullowney 276d083f849SBarry Smith Collective 277e057df02SPaul Mullowney 278e057df02SPaul Mullowney Input Parameters: 279e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 280e057df02SPaul Mullowney . m - number of rows 281e057df02SPaul Mullowney . n - number of columns 282e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 283e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 2840298fd71SBarry Smith (possibly different for each row) or NULL 285e057df02SPaul Mullowney 286e057df02SPaul Mullowney Output Parameter: 287e057df02SPaul Mullowney . A - the matrix 288e057df02SPaul Mullowney 289e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 290e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 291e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 292e057df02SPaul Mullowney 293e057df02SPaul Mullowney Notes: 294e057df02SPaul Mullowney If nnz is given then nz is ignored 295e057df02SPaul Mullowney 296e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 297e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 298e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 299e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 300e057df02SPaul Mullowney 301e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 3020298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 303e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 304e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 305e057df02SPaul Mullowney 306e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 307e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 308e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 309e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 310e057df02SPaul Mullowney 311e057df02SPaul Mullowney Level: intermediate 312e057df02SPaul Mullowney 313e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 314e057df02SPaul Mullowney @*/ 3159ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 3169ae82921SPaul Mullowney { 3179ae82921SPaul Mullowney PetscErrorCode ierr; 3189ae82921SPaul Mullowney PetscMPIInt size; 3199ae82921SPaul Mullowney 3209ae82921SPaul Mullowney PetscFunctionBegin; 3219ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 3229ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 3239ae82921SPaul Mullowney ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 3249ae82921SPaul Mullowney if (size > 1) { 3259ae82921SPaul Mullowney ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 3269ae82921SPaul Mullowney ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 3279ae82921SPaul Mullowney } else { 3289ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3299ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 3309ae82921SPaul Mullowney } 3319ae82921SPaul Mullowney PetscFunctionReturn(0); 3329ae82921SPaul Mullowney } 3339ae82921SPaul Mullowney 3343ca39a21SBarry Smith /*MC 335e057df02SPaul Mullowney MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 336e057df02SPaul Mullowney 3372692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3382692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3392692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3409ae82921SPaul Mullowney 3419ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 3429ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 3439ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 3449ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 3459ae82921SPaul Mullowney the above preallocation routines for simplicity. 3469ae82921SPaul Mullowney 3479ae82921SPaul Mullowney Options Database Keys: 348e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 3498468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3508468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3518468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3529ae82921SPaul Mullowney 3539ae82921SPaul Mullowney Level: beginner 3549ae82921SPaul Mullowney 3558468deeeSKarl Rupp .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3568468deeeSKarl Rupp M 3579ae82921SPaul Mullowney M*/ 358