1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 20fd2b57fSKarl Rupp 33d13b8fdSMatthew G. Knepley #include <petscconf.h> 49ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 53d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 69ae82921SPaul Mullowney 79ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 89ae82921SPaul Mullowney { 9bbf3fe20SPaul Mullowney Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 10bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 119ae82921SPaul Mullowney PetscErrorCode ierr; 129ae82921SPaul Mullowney PetscInt i; 139ae82921SPaul Mullowney 149ae82921SPaul Mullowney PetscFunctionBegin; 159ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 169ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 179ae82921SPaul Mullowney if (d_nnz) { 189ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 199ae82921SPaul Mullowney if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 209ae82921SPaul Mullowney } 219ae82921SPaul Mullowney } 229ae82921SPaul Mullowney if (o_nnz) { 239ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 249ae82921SPaul Mullowney if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 259ae82921SPaul Mullowney } 269ae82921SPaul Mullowney } 279ae82921SPaul Mullowney if (!B->preallocated) { 28bbf3fe20SPaul Mullowney /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 299ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 309ae82921SPaul Mullowney ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 319ae82921SPaul Mullowney ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 323bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 339ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 349ae82921SPaul Mullowney ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 359ae82921SPaul Mullowney ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 363bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 379ae82921SPaul Mullowney } 389ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 399ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 40e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 41e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 42b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 43b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 44b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 45b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 462205254eSKarl Rupp 479ae82921SPaul Mullowney B->preallocated = PETSC_TRUE; 489ae82921SPaul Mullowney PetscFunctionReturn(0); 499ae82921SPaul Mullowney } 50e057df02SPaul Mullowney 519ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 529ae82921SPaul Mullowney { 53e057df02SPaul Mullowney /* This multiplication sequence is different sequence 54e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 55e057df02SPaul Mullowney multiplication kernel is launched in one stream. Then, 56e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 57e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 58e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 59e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 60e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 61e057df02SPaul Mullowney against race conditions. 62e057df02SPaul Mullowney 63e057df02SPaul Mullowney This sequence should only be called for GPU computation. */ 649ae82921SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 659ae82921SPaul Mullowney PetscErrorCode ierr; 669ae82921SPaul Mullowney PetscInt nt; 679ae82921SPaul Mullowney 689ae82921SPaul Mullowney PetscFunctionBegin; 699ae82921SPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 709ae82921SPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 719ae82921SPaul Mullowney ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 729ae82921SPaul Mullowney ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 739ae82921SPaul Mullowney ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 749ae82921SPaul Mullowney ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 759ae82921SPaul Mullowney ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 769ae82921SPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 779ae82921SPaul Mullowney PetscFunctionReturn(0); 789ae82921SPaul Mullowney } 79ca45077fSPaul Mullowney 80ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 81ca45077fSPaul Mullowney { 82e057df02SPaul Mullowney /* This multiplication sequence is different sequence 83e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 84e057df02SPaul Mullowney multiplication kernel is launched in one stream. Then, 85e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 86e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 87e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 88e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 89e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 90e057df02SPaul Mullowney against race conditions. 91e057df02SPaul Mullowney 92e057df02SPaul Mullowney This sequence should only be called for GPU computation. */ 93ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 94ca45077fSPaul Mullowney PetscErrorCode ierr; 95ca45077fSPaul Mullowney PetscInt nt; 96ca45077fSPaul Mullowney 97ca45077fSPaul Mullowney PetscFunctionBegin; 98ca45077fSPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 99ca45077fSPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 1009b2db222SKarl Rupp ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr); 1019b2db222SKarl Rupp ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 102ca45077fSPaul Mullowney ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 1039b2db222SKarl Rupp ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 1049b2db222SKarl Rupp ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 105ca45077fSPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 106ca45077fSPaul Mullowney PetscFunctionReturn(0); 107ca45077fSPaul Mullowney } 1089ae82921SPaul Mullowney 109e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 110ca45077fSPaul Mullowney { 111ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 112bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 113e057df02SPaul Mullowney 114ca45077fSPaul Mullowney PetscFunctionBegin; 115e057df02SPaul Mullowney switch (op) { 116e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 117e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 118045c96e1SPaul Mullowney break; 119e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 120e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 121045c96e1SPaul Mullowney break; 122e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 123e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 124e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 125045c96e1SPaul Mullowney break; 126e057df02SPaul Mullowney default: 127e057df02SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 128045c96e1SPaul Mullowney } 129ca45077fSPaul Mullowney PetscFunctionReturn(0); 130ca45077fSPaul Mullowney } 131e057df02SPaul Mullowney 1324416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 133e057df02SPaul Mullowney { 134e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 135e057df02SPaul Mullowney PetscErrorCode ierr; 136e057df02SPaul Mullowney PetscBool flg; 137a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 138a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 1395fd66863SKarl Rupp 140e057df02SPaul Mullowney PetscFunctionBegin; 141e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 142e057df02SPaul Mullowney ierr = PetscObjectOptionsBegin((PetscObject)A); 143e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 144e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 145a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 146e057df02SPaul Mullowney if (flg) { 147e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 148e057df02SPaul Mullowney } 149e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 150a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 151e057df02SPaul Mullowney if (flg) { 152e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 153e057df02SPaul Mullowney } 154e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 155a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 156e057df02SPaul Mullowney if (flg) { 157e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 158e057df02SPaul Mullowney } 159e057df02SPaul Mullowney } 160e057df02SPaul Mullowney ierr = PetscOptionsEnd();CHKERRQ(ierr); 161e057df02SPaul Mullowney PetscFunctionReturn(0); 162e057df02SPaul Mullowney } 163e057df02SPaul Mullowney 16434d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 16534d6c7a5SJose E. Roman { 16634d6c7a5SJose E. Roman PetscErrorCode ierr; 16734d6c7a5SJose E. Roman Mat_MPIAIJ *mpiaij; 16834d6c7a5SJose E. Roman 16934d6c7a5SJose E. Roman PetscFunctionBegin; 17034d6c7a5SJose E. Roman mpiaij = (Mat_MPIAIJ*)A->data; 17134d6c7a5SJose E. Roman ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 17234d6c7a5SJose E. Roman if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 17334d6c7a5SJose E. Roman ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 17434d6c7a5SJose E. Roman } 17534d6c7a5SJose E. Roman PetscFunctionReturn(0); 17634d6c7a5SJose E. Roman } 17734d6c7a5SJose E. Roman 178bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 179bbf3fe20SPaul Mullowney { 180bbf3fe20SPaul Mullowney PetscErrorCode ierr; 181bbf3fe20SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 182bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 183b06137fdSPaul Mullowney cudaError_t err; 184b06137fdSPaul Mullowney cusparseStatus_t stat; 185bbf3fe20SPaul Mullowney 186bbf3fe20SPaul Mullowney PetscFunctionBegin; 187bbf3fe20SPaul Mullowney try { 188b06137fdSPaul Mullowney ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 189b06137fdSPaul Mullowney ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 190c41cb2e2SAlejandro Lamas Daviña stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat); 191c41cb2e2SAlejandro Lamas Daviña err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 192bbf3fe20SPaul Mullowney delete cusparseStruct; 193bbf3fe20SPaul Mullowney } catch(char *ex) { 194bbf3fe20SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 195bbf3fe20SPaul Mullowney } 196bbf3fe20SPaul Mullowney cusparseStruct = 0; 1972205254eSKarl Rupp 198bbf3fe20SPaul Mullowney ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 199bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 200bbf3fe20SPaul Mullowney } 201ca45077fSPaul Mullowney 2028cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 2039ae82921SPaul Mullowney { 2049ae82921SPaul Mullowney PetscErrorCode ierr; 205bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 206bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct; 207b06137fdSPaul Mullowney cudaError_t err; 208b06137fdSPaul Mullowney cusparseStatus_t stat; 2099ae82921SPaul Mullowney 2109ae82921SPaul Mullowney PetscFunctionBegin; 211bbf3fe20SPaul Mullowney ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 212bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 213*34136279SStefano Zampini ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 214*34136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 215*34136279SStefano Zampini 216bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ*)A->data; 217bbf3fe20SPaul Mullowney a->spptr = new Mat_MPIAIJCUSPARSE; 2182205254eSKarl Rupp 219bbf3fe20SPaul Mullowney cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 220e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 221e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 222c41cb2e2SAlejandro Lamas Daviña stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat); 223c41cb2e2SAlejandro Lamas Daviña err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err); 2242205254eSKarl Rupp 22534d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 226bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 227bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 228bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 229bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 2302205254eSKarl Rupp 231bbf3fe20SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 232bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 2339ae82921SPaul Mullowney PetscFunctionReturn(0); 2349ae82921SPaul Mullowney } 2359ae82921SPaul Mullowney 2363f9c0db1SPaul Mullowney /*@ 2373f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 238e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2393f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 240e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 241e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 242e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 2439ae82921SPaul Mullowney 244e057df02SPaul Mullowney Collective on MPI_Comm 245e057df02SPaul Mullowney 246e057df02SPaul Mullowney Input Parameters: 247e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 248e057df02SPaul Mullowney . m - number of rows 249e057df02SPaul Mullowney . n - number of columns 250e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 251e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 2520298fd71SBarry Smith (possibly different for each row) or NULL 253e057df02SPaul Mullowney 254e057df02SPaul Mullowney Output Parameter: 255e057df02SPaul Mullowney . A - the matrix 256e057df02SPaul Mullowney 257e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 258e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 259e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 260e057df02SPaul Mullowney 261e057df02SPaul Mullowney Notes: 262e057df02SPaul Mullowney If nnz is given then nz is ignored 263e057df02SPaul Mullowney 264e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 265e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 266e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 267e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 268e057df02SPaul Mullowney 269e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 2700298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 271e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 272e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 273e057df02SPaul Mullowney 274e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 275e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 276e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 277e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 278e057df02SPaul Mullowney 279e057df02SPaul Mullowney Level: intermediate 280e057df02SPaul Mullowney 281e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 282e057df02SPaul Mullowney @*/ 2839ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 2849ae82921SPaul Mullowney { 2859ae82921SPaul Mullowney PetscErrorCode ierr; 2869ae82921SPaul Mullowney PetscMPIInt size; 2879ae82921SPaul Mullowney 2889ae82921SPaul Mullowney PetscFunctionBegin; 2899ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 2909ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 2919ae82921SPaul Mullowney ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 2929ae82921SPaul Mullowney if (size > 1) { 2939ae82921SPaul Mullowney ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 2949ae82921SPaul Mullowney ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 2959ae82921SPaul Mullowney } else { 2969ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2979ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 2989ae82921SPaul Mullowney } 2999ae82921SPaul Mullowney PetscFunctionReturn(0); 3009ae82921SPaul Mullowney } 3019ae82921SPaul Mullowney 3023ca39a21SBarry Smith /*MC 303e057df02SPaul Mullowney MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 304e057df02SPaul Mullowney 3052692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3062692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3072692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3089ae82921SPaul Mullowney 3099ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 3109ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 3119ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 3129ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 3139ae82921SPaul Mullowney the above preallocation routines for simplicity. 3149ae82921SPaul Mullowney 3159ae82921SPaul Mullowney Options Database Keys: 316e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 3178468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3188468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3198468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3209ae82921SPaul Mullowney 3219ae82921SPaul Mullowney Level: beginner 3229ae82921SPaul Mullowney 3238468deeeSKarl Rupp .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3248468deeeSKarl Rupp M 3259ae82921SPaul Mullowney M*/ 326