1 #define PETSC_SKIP_SPINLOCK 2 3 #include <petscconf.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 6 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 10 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 11 { 12 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 14 PetscErrorCode ierr; 15 PetscInt i; 16 17 PetscFunctionBegin; 18 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 19 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 20 if (d_nnz) { 21 for (i=0; i<B->rmap->n; i++) { 22 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 23 } 24 } 25 if (o_nnz) { 26 for (i=0; i<B->rmap->n; i++) { 27 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 28 } 29 } 30 if (!B->preallocated) { 31 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 32 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 33 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 34 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 35 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 36 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 37 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 38 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 39 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 40 } 41 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 42 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 43 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 44 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 45 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 46 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 47 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 48 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 49 50 B->preallocated = PETSC_TRUE; 51 PetscFunctionReturn(0); 52 } 53 54 #undef __FUNCT__ 55 #define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE" 56 PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 57 { 58 PetscErrorCode ierr; 59 PetscInt rbs,cbs; 60 61 PetscFunctionBegin; 62 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 63 if (right) { 64 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 65 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 66 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 67 ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr); 68 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 69 } 70 if (left) { 71 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 72 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 73 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 74 ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr); 75 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 76 77 78 } 79 PetscFunctionReturn(0); 80 } 81 82 #undef __FUNCT__ 83 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 84 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 85 { 86 /* This multiplication sequence is different sequence 87 than the CPU version. In particular, the diagonal block 88 multiplication kernel is launched in one stream. Then, 89 in a separate stream, the data transfers from DeviceToHost 90 (with MPI messaging in between), then HostToDevice are 91 launched. Once the data transfer stream is synchronized, 92 to ensure messaging is complete, the MatMultAdd kernel 93 is launched in the original (MatMult) stream to protect 94 against race conditions. 95 96 This sequence should only be called for GPU computation. */ 97 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 98 PetscErrorCode ierr; 99 PetscInt nt; 100 101 PetscFunctionBegin; 102 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 103 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 104 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 105 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 106 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 107 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 108 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 109 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 113 #undef __FUNCT__ 114 #define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE" 115 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 116 { 117 /* This multiplication sequence is different sequence 118 than the CPU version. In particular, the diagonal block 119 multiplication kernel is launched in one stream. Then, 120 in a separate stream, the data transfers from DeviceToHost 121 (with MPI messaging in between), then HostToDevice are 122 launched. Once the data transfer stream is synchronized, 123 to ensure messaging is complete, the MatMultAdd kernel 124 is launched in the original (MatMult) stream to protect 125 against race conditions. 126 127 This sequence should only be called for GPU computation. */ 128 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 129 PetscErrorCode ierr; 130 PetscInt nt; 131 132 PetscFunctionBegin; 133 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 134 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 135 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 136 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 137 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 138 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 139 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 140 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 141 PetscFunctionReturn(0); 142 } 143 144 #undef __FUNCT__ 145 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 146 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 147 { 148 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 149 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 150 151 PetscFunctionBegin; 152 switch (op) { 153 case MAT_CUSPARSE_MULT_DIAG: 154 cusparseStruct->diagGPUMatFormat = format; 155 break; 156 case MAT_CUSPARSE_MULT_OFFDIAG: 157 cusparseStruct->offdiagGPUMatFormat = format; 158 break; 159 case MAT_CUSPARSE_ALL: 160 cusparseStruct->diagGPUMatFormat = format; 161 cusparseStruct->offdiagGPUMatFormat = format; 162 break; 163 default: 164 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 165 } 166 PetscFunctionReturn(0); 167 } 168 169 #undef __FUNCT__ 170 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 171 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 172 { 173 MatCUSPARSEStorageFormat format; 174 PetscErrorCode ierr; 175 PetscBool flg; 176 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 177 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 178 179 PetscFunctionBegin; 180 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 181 ierr = PetscObjectOptionsBegin((PetscObject)A); 182 if (A->factortype==MAT_FACTOR_NONE) { 183 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 184 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 185 if (flg) { 186 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 187 } 188 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 189 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 190 if (flg) { 191 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 192 } 193 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 194 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 195 if (flg) { 196 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 197 } 198 } 199 ierr = PetscOptionsEnd();CHKERRQ(ierr); 200 PetscFunctionReturn(0); 201 } 202 203 PETSC_EXTERN PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat,MatAssemblyType); 204 205 #undef __FUNCT__ 206 #define __FUNCT__ "MatAssemblyEnd_MPIAIJCUSPARSE" 207 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 208 { 209 PetscErrorCode ierr; 210 Mat_MPIAIJ *mpiaij; 211 212 PetscFunctionBegin; 213 mpiaij = (Mat_MPIAIJ*)A->data; 214 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 215 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 216 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 217 } 218 PetscFunctionReturn(0); 219 } 220 221 #undef __FUNCT__ 222 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 223 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 224 { 225 PetscErrorCode ierr; 226 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 227 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 228 cudaError_t err; 229 cusparseStatus_t stat; 230 231 PetscFunctionBegin; 232 try { 233 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 234 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 235 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat); 236 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 237 delete cusparseStruct; 238 } catch(char *ex) { 239 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 240 } 241 cusparseStruct = 0; 242 243 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 244 PetscFunctionReturn(0); 245 } 246 247 #undef __FUNCT__ 248 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 249 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 250 { 251 PetscErrorCode ierr; 252 Mat_MPIAIJ *a; 253 Mat_MPIAIJCUSPARSE * cusparseStruct; 254 cudaError_t err; 255 cusparseStatus_t stat; 256 257 PetscFunctionBegin; 258 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 259 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 260 a = (Mat_MPIAIJ*)A->data; 261 a->spptr = new Mat_MPIAIJCUSPARSE; 262 263 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 264 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 265 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 266 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat); 267 err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err); 268 269 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 270 A->ops->getvecs = MatCreateVecs_MPIAIJCUSPARSE; 271 A->ops->mult = MatMult_MPIAIJCUSPARSE; 272 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 273 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 274 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 275 276 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 277 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 278 PetscFunctionReturn(0); 279 } 280 281 /*@ 282 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 283 (the default parallel PETSc format). This matrix will ultimately pushed down 284 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 285 assembly performance the user should preallocate the matrix storage by setting 286 the parameter nz (or the array nnz). By setting these parameters accurately, 287 performance during matrix assembly can be increased by more than a factor of 50. 288 289 Collective on MPI_Comm 290 291 Input Parameters: 292 + comm - MPI communicator, set to PETSC_COMM_SELF 293 . m - number of rows 294 . n - number of columns 295 . nz - number of nonzeros per row (same for all rows) 296 - nnz - array containing the number of nonzeros in the various rows 297 (possibly different for each row) or NULL 298 299 Output Parameter: 300 . A - the matrix 301 302 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 303 MatXXXXSetPreallocation() paradigm instead of this routine directly. 304 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 305 306 Notes: 307 If nnz is given then nz is ignored 308 309 The AIJ format (also called the Yale sparse matrix format or 310 compressed row storage), is fully compatible with standard Fortran 77 311 storage. That is, the stored row and column indices can begin at 312 either one (as in Fortran) or zero. See the users' manual for details. 313 314 Specify the preallocated storage with either nz or nnz (not both). 315 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 316 allocation. For large problems you MUST preallocate memory or you 317 will get TERRIBLE performance, see the users' manual chapter on matrices. 318 319 By default, this format uses inodes (identical nodes) when possible, to 320 improve numerical efficiency of matrix-vector products and solves. We 321 search for consecutive rows with the same nonzero structure, thereby 322 reusing matrix information to achieve increased efficiency. 323 324 Level: intermediate 325 326 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 327 @*/ 328 #undef __FUNCT__ 329 #define __FUNCT__ "MatCreateAIJCUSPARSE" 330 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 331 { 332 PetscErrorCode ierr; 333 PetscMPIInt size; 334 335 PetscFunctionBegin; 336 ierr = MatCreate(comm,A);CHKERRQ(ierr); 337 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 338 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 339 if (size > 1) { 340 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 341 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 342 } else { 343 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 344 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 345 } 346 PetscFunctionReturn(0); 347 } 348 349 /*M 350 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 351 352 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 353 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 354 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 355 356 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 357 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 358 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 359 for communicators controlling multiple processes. It is recommended that you call both of 360 the above preallocation routines for simplicity. 361 362 Options Database Keys: 363 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 364 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 365 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 366 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 367 368 Level: beginner 369 370 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 371 M 372 M*/ 373