1 #define PETSC_SKIP_SPINLOCK 2 3 #include <petscconf.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 6 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 10 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 11 { 12 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 14 PetscErrorCode ierr; 15 PetscInt i; 16 17 PetscFunctionBegin; 18 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 19 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 20 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 21 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 22 23 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 24 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 25 if (d_nnz) { 26 for (i=0; i<B->rmap->n; i++) { 27 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 28 } 29 } 30 if (o_nnz) { 31 for (i=0; i<B->rmap->n; i++) { 32 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 33 } 34 } 35 if (!B->preallocated) { 36 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 37 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 38 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 39 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 40 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 41 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 42 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 43 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 44 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 45 } 46 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 47 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 48 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 51 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 52 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 53 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 54 55 B->preallocated = PETSC_TRUE; 56 PetscFunctionReturn(0); 57 } 58 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE" 61 PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 62 { 63 PetscErrorCode ierr; 64 PetscInt rbs,cbs; 65 66 PetscFunctionBegin; 67 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 68 if (right) { 69 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 70 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 71 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 72 ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr); 73 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 74 } 75 if (left) { 76 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 77 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 78 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 79 ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr); 80 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 81 82 83 } 84 PetscFunctionReturn(0); 85 } 86 87 88 #undef __FUNCT__ 89 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 90 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 91 { 92 /* This multiplication sequence is different sequence 93 than the CPU version. In particular, the diagonal block 94 multiplication kernel is launched in one stream. Then, 95 in a separate stream, the data transfers from DeviceToHost 96 (with MPI messaging in between), then HostToDevice are 97 launched. Once the data transfer stream is synchronized, 98 to ensure messaging is complete, the MatMultAdd kernel 99 is launched in the original (MatMult) stream to protect 100 against race conditions. 101 102 This sequence should only be called for GPU computation. */ 103 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 104 PetscErrorCode ierr; 105 PetscInt nt; 106 107 PetscFunctionBegin; 108 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 109 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 110 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 111 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 112 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 113 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 114 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 115 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 119 #undef __FUNCT__ 120 #define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE" 121 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 122 { 123 /* This multiplication sequence is different sequence 124 than the CPU version. In particular, the diagonal block 125 multiplication kernel is launched in one stream. Then, 126 in a separate stream, the data transfers from DeviceToHost 127 (with MPI messaging in between), then HostToDevice are 128 launched. Once the data transfer stream is synchronized, 129 to ensure messaging is complete, the MatMultAdd kernel 130 is launched in the original (MatMult) stream to protect 131 against race conditions. 132 133 This sequence should only be called for GPU computation. */ 134 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 135 PetscErrorCode ierr; 136 PetscInt nt; 137 138 PetscFunctionBegin; 139 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 140 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 141 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 142 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 143 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 144 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 145 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 146 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 147 PetscFunctionReturn(0); 148 } 149 150 #undef __FUNCT__ 151 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 152 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 153 { 154 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 155 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 156 157 PetscFunctionBegin; 158 switch (op) { 159 case MAT_CUSPARSE_MULT_DIAG: 160 cusparseStruct->diagGPUMatFormat = format; 161 break; 162 case MAT_CUSPARSE_MULT_OFFDIAG: 163 cusparseStruct->offdiagGPUMatFormat = format; 164 break; 165 case MAT_CUSPARSE_ALL: 166 cusparseStruct->diagGPUMatFormat = format; 167 cusparseStruct->offdiagGPUMatFormat = format; 168 break; 169 default: 170 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 171 } 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 177 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 178 { 179 MatCUSPARSEStorageFormat format; 180 PetscErrorCode ierr; 181 PetscBool flg; 182 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 183 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 184 185 PetscFunctionBegin; 186 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 187 ierr = PetscObjectOptionsBegin((PetscObject)A); 188 if (A->factortype==MAT_FACTOR_NONE) { 189 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 190 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 191 if (flg) { 192 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 193 } 194 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 195 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 196 if (flg) { 197 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 198 } 199 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 200 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 201 if (flg) { 202 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 203 } 204 } 205 ierr = PetscOptionsEnd();CHKERRQ(ierr); 206 PetscFunctionReturn(0); 207 } 208 209 #undef __FUNCT__ 210 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 211 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 212 { 213 PetscErrorCode ierr; 214 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 215 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 216 cudaError_t err; 217 cusparseStatus_t stat; 218 219 PetscFunctionBegin; 220 try { 221 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 222 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 223 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat); 224 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 225 delete cusparseStruct; 226 } catch(char *ex) { 227 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 228 } 229 cusparseStruct = 0; 230 231 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 232 PetscFunctionReturn(0); 233 } 234 235 #undef __FUNCT__ 236 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 237 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 238 { 239 PetscErrorCode ierr; 240 Mat_MPIAIJ *a; 241 Mat_MPIAIJCUSPARSE * cusparseStruct; 242 cudaError_t err; 243 cusparseStatus_t stat; 244 245 PetscFunctionBegin; 246 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 247 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 248 a = (Mat_MPIAIJ*)A->data; 249 a->spptr = new Mat_MPIAIJCUSPARSE; 250 251 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 252 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 253 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 254 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat); 255 err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err); 256 257 A->ops->getvecs = MatCreateVecs_MPIAIJCUSPARSE; 258 A->ops->mult = MatMult_MPIAIJCUSPARSE; 259 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 260 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 261 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 262 263 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 264 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 265 PetscFunctionReturn(0); 266 } 267 268 /*@ 269 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 270 (the default parallel PETSc format). This matrix will ultimately pushed down 271 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 272 assembly performance the user should preallocate the matrix storage by setting 273 the parameter nz (or the array nnz). By setting these parameters accurately, 274 performance during matrix assembly can be increased by more than a factor of 50. 275 276 Collective on MPI_Comm 277 278 Input Parameters: 279 + comm - MPI communicator, set to PETSC_COMM_SELF 280 . m - number of rows 281 . n - number of columns 282 . nz - number of nonzeros per row (same for all rows) 283 - nnz - array containing the number of nonzeros in the various rows 284 (possibly different for each row) or NULL 285 286 Output Parameter: 287 . A - the matrix 288 289 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 290 MatXXXXSetPreallocation() paradigm instead of this routine directly. 291 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 292 293 Notes: 294 If nnz is given then nz is ignored 295 296 The AIJ format (also called the Yale sparse matrix format or 297 compressed row storage), is fully compatible with standard Fortran 77 298 storage. That is, the stored row and column indices can begin at 299 either one (as in Fortran) or zero. See the users' manual for details. 300 301 Specify the preallocated storage with either nz or nnz (not both). 302 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 303 allocation. For large problems you MUST preallocate memory or you 304 will get TERRIBLE performance, see the users' manual chapter on matrices. 305 306 By default, this format uses inodes (identical nodes) when possible, to 307 improve numerical efficiency of matrix-vector products and solves. We 308 search for consecutive rows with the same nonzero structure, thereby 309 reusing matrix information to achieve increased efficiency. 310 311 Level: intermediate 312 313 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 314 @*/ 315 #undef __FUNCT__ 316 #define __FUNCT__ "MatCreateAIJCUSPARSE" 317 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 318 { 319 PetscErrorCode ierr; 320 PetscMPIInt size; 321 322 PetscFunctionBegin; 323 ierr = MatCreate(comm,A);CHKERRQ(ierr); 324 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 325 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 326 if (size > 1) { 327 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 328 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 329 } else { 330 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 331 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 332 } 333 PetscFunctionReturn(0); 334 } 335 336 /*M 337 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 338 339 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 340 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 341 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 342 343 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 344 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 345 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 346 for communicators controlling multiple processes. It is recommended that you call both of 347 the above preallocation routines for simplicity. 348 349 Options Database Keys: 350 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 351 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 352 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 353 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 354 355 Level: beginner 356 357 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 358 M 359 M*/ 360