1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_CXX_COMPLEX_FIX 3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 4 5 #include <petscconf.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 9 10 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 11 { 12 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 14 PetscErrorCode ierr; 15 PetscInt i; 16 17 PetscFunctionBegin; 18 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 19 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 20 if (d_nnz) { 21 for (i=0; i<B->rmap->n; i++) { 22 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 23 } 24 } 25 if (o_nnz) { 26 for (i=0; i<B->rmap->n; i++) { 27 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 28 } 29 } 30 if (!B->preallocated) { 31 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 32 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 33 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 34 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 35 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 36 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 37 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 38 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 40 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 42 } 43 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 44 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 45 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 46 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 47 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 48 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 51 52 B->preallocated = PETSC_TRUE; 53 PetscFunctionReturn(0); 54 } 55 56 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 57 { 58 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 59 PetscErrorCode ierr; 60 PetscInt nt; 61 62 PetscFunctionBegin; 63 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 64 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 65 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 66 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 67 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 68 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 69 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 70 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 71 PetscFunctionReturn(0); 72 } 73 74 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 75 { 76 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 77 PetscErrorCode ierr; 78 PetscInt nt; 79 80 PetscFunctionBegin; 81 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 82 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 83 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 84 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 85 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 86 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 87 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 88 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 89 PetscFunctionReturn(0); 90 } 91 92 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 93 { 94 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 95 PetscErrorCode ierr; 96 PetscInt nt; 97 98 PetscFunctionBegin; 99 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 100 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 101 ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr); 102 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 103 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 104 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 105 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 106 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 107 PetscFunctionReturn(0); 108 } 109 110 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 111 { 112 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 113 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 114 115 PetscFunctionBegin; 116 switch (op) { 117 case MAT_CUSPARSE_MULT_DIAG: 118 cusparseStruct->diagGPUMatFormat = format; 119 break; 120 case MAT_CUSPARSE_MULT_OFFDIAG: 121 cusparseStruct->offdiagGPUMatFormat = format; 122 break; 123 case MAT_CUSPARSE_ALL: 124 cusparseStruct->diagGPUMatFormat = format; 125 cusparseStruct->offdiagGPUMatFormat = format; 126 break; 127 default: 128 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 129 } 130 PetscFunctionReturn(0); 131 } 132 133 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 134 { 135 MatCUSPARSEStorageFormat format; 136 PetscErrorCode ierr; 137 PetscBool flg; 138 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 139 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 140 141 PetscFunctionBegin; 142 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 143 if (A->factortype==MAT_FACTOR_NONE) { 144 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 145 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 146 if (flg) { 147 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 148 } 149 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 150 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 151 if (flg) { 152 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 153 } 154 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 155 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 156 if (flg) { 157 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 158 } 159 } 160 ierr = PetscOptionsTail();CHKERRQ(ierr); 161 PetscFunctionReturn(0); 162 } 163 164 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 165 { 166 PetscErrorCode ierr; 167 Mat_MPIAIJ *mpiaij; 168 169 PetscFunctionBegin; 170 mpiaij = (Mat_MPIAIJ*)A->data; 171 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 172 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 173 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 174 } 175 PetscFunctionReturn(0); 176 } 177 178 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 179 { 180 PetscErrorCode ierr; 181 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 182 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 183 cudaError_t err; 184 cusparseStatus_t stat; 185 186 PetscFunctionBegin; 187 try { 188 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 189 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 190 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 191 if (cusparseStruct->stream) { 192 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 193 } 194 delete cusparseStruct; 195 } catch(char *ex) { 196 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 197 } 198 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 199 PetscFunctionReturn(0); 200 } 201 202 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 203 { 204 PetscErrorCode ierr; 205 Mat_MPIAIJ *a; 206 Mat_MPIAIJCUSPARSE * cusparseStruct; 207 cusparseStatus_t stat; 208 209 PetscFunctionBegin; 210 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 211 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 212 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 213 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 214 215 a = (Mat_MPIAIJ*)A->data; 216 a->spptr = new Mat_MPIAIJCUSPARSE; 217 218 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 219 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 220 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 221 cusparseStruct->stream = 0; 222 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 223 224 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 225 A->ops->mult = MatMult_MPIAIJCUSPARSE; 226 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 227 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 228 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 229 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 230 231 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 232 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 233 PetscFunctionReturn(0); 234 } 235 236 /*@ 237 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 238 (the default parallel PETSc format). This matrix will ultimately pushed down 239 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 240 assembly performance the user should preallocate the matrix storage by setting 241 the parameter nz (or the array nnz). By setting these parameters accurately, 242 performance during matrix assembly can be increased by more than a factor of 50. 243 244 Collective 245 246 Input Parameters: 247 + comm - MPI communicator, set to PETSC_COMM_SELF 248 . m - number of rows 249 . n - number of columns 250 . nz - number of nonzeros per row (same for all rows) 251 - nnz - array containing the number of nonzeros in the various rows 252 (possibly different for each row) or NULL 253 254 Output Parameter: 255 . A - the matrix 256 257 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 258 MatXXXXSetPreallocation() paradigm instead of this routine directly. 259 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 260 261 Notes: 262 If nnz is given then nz is ignored 263 264 The AIJ format (also called the Yale sparse matrix format or 265 compressed row storage), is fully compatible with standard Fortran 77 266 storage. That is, the stored row and column indices can begin at 267 either one (as in Fortran) or zero. See the users' manual for details. 268 269 Specify the preallocated storage with either nz or nnz (not both). 270 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 271 allocation. For large problems you MUST preallocate memory or you 272 will get TERRIBLE performance, see the users' manual chapter on matrices. 273 274 By default, this format uses inodes (identical nodes) when possible, to 275 improve numerical efficiency of matrix-vector products and solves. We 276 search for consecutive rows with the same nonzero structure, thereby 277 reusing matrix information to achieve increased efficiency. 278 279 Level: intermediate 280 281 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 282 @*/ 283 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 284 { 285 PetscErrorCode ierr; 286 PetscMPIInt size; 287 288 PetscFunctionBegin; 289 ierr = MatCreate(comm,A);CHKERRQ(ierr); 290 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 291 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 292 if (size > 1) { 293 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 294 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 295 } else { 296 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 297 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 298 } 299 PetscFunctionReturn(0); 300 } 301 302 /*MC 303 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 304 305 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 306 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 307 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 308 309 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 310 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 311 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 312 for communicators controlling multiple processes. It is recommended that you call both of 313 the above preallocation routines for simplicity. 314 315 Options Database Keys: 316 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 317 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 318 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 319 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 320 321 Level: beginner 322 323 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 324 M 325 M*/ 326