#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
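/* Usage sketch (illustrative, not part of the library): a typical way user code
   reaches the preallocation routine above. The global size 100 and the nonzero
   counts are made-up placeholders.

     Mat            A;
     PetscErrorCode ierr;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);

   Here 5 is the expected nonzeros per row in the diagonal block and 2 in the
   off-diagonal block; MatMPIAIJSetPreallocation() dispatches through the
   "MatMPIAIJSetPreallocation_C" method composed by this type. */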
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE"
PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. The diagonal
     block multiplication kernel is launched in one stream. In a separate
     stream, the DeviceToHost transfers (with MPI messaging in between) and
     the HostToDevice transfers are launched. Once the transfer stream has
     been synchronized, so that messaging is known to be complete, the
     MatMultAdd kernel for the off-diagonal block is launched in the original
     (MatMult) stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
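/* Usage sketch (illustrative): MatMult() on an assembled MPIAIJCUSPARSE matrix A
   dispatches to MatMult_MPIAIJCUSPARSE() above. The vectors should be VECCUDA,
   which MatCreateVecs() provides for this type. Error handling abbreviated.

     Vec x,y;
     ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = MatMult(A,x,y);CHKERRQ(ierr);
*/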
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* The transpose analogue of the sequence above: the two local transpose
     products are launched on the GPU, then the ghost contributions in a->lvec
     are scattered back (SCATTER_REVERSE) and added into yy. The scatter is
     synchronized before the result is used, to protect against race
     conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
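/* Usage sketch (illustrative): the storage formats handled above can be chosen
   at runtime through the options database, e.g.

     ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format ell
           -mat_cusparse_mult_offdiag_storage_format hyb

   or programmatically, which is the equivalent of -mat_cusparse_storage_format:

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
*/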
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;   /* null the stored pointer, not just a local copy, so it cannot dangle */

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
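/* Note (illustrative): the constructor above is not called directly; it runs
   when the type is selected, e.g.

     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);

   after which the operation table of A points at the CUSPARSE implementations
   installed above. */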
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or the array d_nnz) and o_nz (or the array o_nnz). By
   setting these parameters accurately, performance during matrix assembly can be
   increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
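/* Usage sketch (illustrative; sizes and nonzero counts are placeholders):

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
     ... insert values with MatSetValues() ...
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/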
/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
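/* Example command line (illustrative): select this type and the HYB storage
   format for an application that calls MatSetFromOptions():

     mpiexec -n 4 ./app -mat_type aijcusparse -vec_type cuda -mat_cusparse_storage_format hyb
*/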