#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices: the diagonal block b->A and the off-diagonal block b->B. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* Overlap the scatter of off-process entries of xx with the diagonal-block multiply, then add the off-diagonal-block contribution */
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* Use a CUDA vector for the local work vector that receives scattered off-process values */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    if (a->A) { ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); }
    if (a->B) { ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B,MatType mtype,MatReuse reuse,Mat *newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation() or MatMPIAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
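
/*
   A minimal usage sketch of the recommended MatCreate()/MatSetType()/preallocation paradigm for
   this matrix type.  The local size and per-row nonzero estimates below are illustrative
   assumptions, not taken from a PETSc example:

     Mat      A;
     Vec      x,y;
     PetscInt m = 100;                                                     (local rows per rank, assumed)
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,m,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);                 (or MatSetFromOptions() with -mat_type mpiaijcusparse)
     ierr = MatMPIAIJSetPreallocation(A,3,NULL,1,NULL);CHKERRQ(ierr);      (rough diagonal/off-diagonal per-row estimates)
     ... fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
     ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr);                          (vectors inherit the matrix's default VECCUDA type)
     ierr = MatMult(A,x,y);CHKERRQ(ierr);                                  (SpMV runs on the GPU through cuSPARSE)
*/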

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
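
/*
   A hypothetical command line illustrating the options above (the executable name is an
   assumption, not part of this source; the flags are the ones registered by
   MatSetFromOptions_MPIAIJCUSPARSE):

     mpiexec -n 4 ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format csr -mat_cusparse_mult_offdiag_storage_format hyb
*/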