1 #include <petscconf.h> 2 PETSC_CUDA_EXTERN_C_BEGIN 3 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 4 PETSC_CUDA_EXTERN_C_END 5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 6 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 10 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 11 { 12 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 14 PetscErrorCode ierr; 15 PetscInt i; 16 17 PetscFunctionBegin; 18 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 19 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 20 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 21 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 22 23 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 24 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 25 if (d_nnz) { 26 for (i=0; i<B->rmap->n; i++) { 27 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 28 } 29 } 30 if (o_nnz) { 31 for (i=0; i<B->rmap->n; i++) { 32 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 33 } 34 } 35 if (!B->preallocated) { 36 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 37 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 38 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 39 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 40 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 41 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 42 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 43 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 44 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 45 } 46 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 47 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 48 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 51 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 52 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 53 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 54 55 B->preallocated = PETSC_TRUE; 56 PetscFunctionReturn(0); 57 } 58 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 61 PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 62 { 63 PetscErrorCode ierr; 64 PetscInt rbs,cbs; 65 66 PetscFunctionBegin; 67 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 68 if (right) { 69 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 70 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 71 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 72 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 73 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 74 } 75 if (left) { 76 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 77 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 78 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 79 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 80 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 81 82 83 } 84 PetscFunctionReturn(0); 85 } 86 87 88 #undef __FUNCT__ 89 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 90 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 91 { 92 /* This multiplication sequence is different sequence 93 than the CPU version. In particular, the diagonal block 94 multiplication kernel is launched in one stream. Then, 95 in a separate stream, the data transfers from DeviceToHost 96 (with MPI messaging in between), then HostToDevice are 97 launched. Once the data transfer stream is synchronized, 98 to ensure messaging is complete, the MatMultAdd kernel 99 is launched in the original (MatMult) stream to protect 100 against race conditions. 101 102 This sequence should only be called for GPU computation. */ 103 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 104 PetscErrorCode ierr; 105 PetscInt nt; 106 107 PetscFunctionBegin; 108 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 109 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 110 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 111 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 112 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 113 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 114 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 115 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 119 #undef __FUNCT__ 120 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 121 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 122 { 123 /* This multiplication sequence is different sequence 124 than the CPU version. In particular, the diagonal block 125 multiplication kernel is launched in one stream. Then, 126 in a separate stream, the data transfers from DeviceToHost 127 (with MPI messaging in between), then HostToDevice are 128 launched. Once the data transfer stream is synchronized, 129 to ensure messaging is complete, the MatMultAdd kernel 130 is launched in the original (MatMult) stream to protect 131 against race conditions. 132 133 This sequence should only be called for GPU computation. */ 134 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 135 PetscErrorCode ierr; 136 PetscInt nt; 137 138 PetscFunctionBegin; 139 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 140 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 141 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 142 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 143 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 144 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 145 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 146 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 147 PetscFunctionReturn(0); 148 } 149 150 #undef __FUNCT__ 151 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 152 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 153 { 154 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 155 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 156 157 PetscFunctionBegin; 158 switch (op) { 159 case MAT_CUSPARSE_MULT_DIAG: 160 cusparseStruct->diagGPUMatFormat = format; 161 break; 162 case MAT_CUSPARSE_MULT_OFFDIAG: 163 cusparseStruct->offdiagGPUMatFormat = format; 164 break; 165 case MAT_CUSPARSE_ALL: 166 cusparseStruct->diagGPUMatFormat = format; 167 cusparseStruct->offdiagGPUMatFormat = format; 168 break; 169 default: 170 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 171 } 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 177 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A) 178 { 179 MatCUSPARSEStorageFormat format; 180 PetscErrorCode ierr; 181 PetscBool flg; 182 183 PetscFunctionBegin; 184 ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr); 185 ierr = PetscObjectOptionsBegin((PetscObject)A); 186 if (A->factortype==MAT_FACTOR_NONE) { 187 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 188 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 189 if (flg) { 190 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 191 } 192 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 193 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 194 if (flg) { 195 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 196 } 197 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 198 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 199 if (flg) { 200 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 201 } 202 } 203 ierr = PetscOptionsEnd();CHKERRQ(ierr); 204 PetscFunctionReturn(0); 205 } 206 207 #undef __FUNCT__ 208 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 209 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 210 { 211 PetscErrorCode ierr; 212 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 213 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 214 cudaError_t err; 215 cusparseStatus_t stat; 216 217 PetscFunctionBegin; 218 try { 219 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 220 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 221 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat); 222 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err); 223 delete cusparseStruct; 224 } catch(char *ex) { 225 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 226 } 227 cusparseStruct = 0; 228 229 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 230 PetscFunctionReturn(0); 231 } 232 233 #undef __FUNCT__ 234 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 235 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 236 { 237 PetscErrorCode ierr; 238 Mat_MPIAIJ *a; 239 Mat_MPIAIJCUSPARSE * cusparseStruct; 240 cudaError_t err; 241 cusparseStatus_t stat; 242 243 PetscFunctionBegin; 244 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 245 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 246 a = (Mat_MPIAIJ*)A->data; 247 a->spptr = new Mat_MPIAIJCUSPARSE; 248 249 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 250 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 251 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 252 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat); 253 err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err); 254 255 A->ops->getvecs = MatGetVecs_MPIAIJCUSPARSE; 256 A->ops->mult = MatMult_MPIAIJCUSPARSE; 257 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 258 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 259 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 260 261 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 262 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 263 PetscFunctionReturn(0); 264 } 265 266 /*@ 267 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 268 (the default parallel PETSc format). This matrix will ultimately pushed down 269 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 270 assembly performance the user should preallocate the matrix storage by setting 271 the parameter nz (or the array nnz). By setting these parameters accurately, 272 performance during matrix assembly can be increased by more than a factor of 50. 273 274 Collective on MPI_Comm 275 276 Input Parameters: 277 + comm - MPI communicator, set to PETSC_COMM_SELF 278 . m - number of rows 279 . n - number of columns 280 . nz - number of nonzeros per row (same for all rows) 281 - nnz - array containing the number of nonzeros in the various rows 282 (possibly different for each row) or NULL 283 284 Output Parameter: 285 . A - the matrix 286 287 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 288 MatXXXXSetPreallocation() paradigm instead of this routine directly. 289 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 290 291 Notes: 292 If nnz is given then nz is ignored 293 294 The AIJ format (also called the Yale sparse matrix format or 295 compressed row storage), is fully compatible with standard Fortran 77 296 storage. That is, the stored row and column indices can begin at 297 either one (as in Fortran) or zero. See the users' manual for details. 298 299 Specify the preallocated storage with either nz or nnz (not both). 300 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 301 allocation. For large problems you MUST preallocate memory or you 302 will get TERRIBLE performance, see the users' manual chapter on matrices. 303 304 By default, this format uses inodes (identical nodes) when possible, to 305 improve numerical efficiency of matrix-vector products and solves. We 306 search for consecutive rows with the same nonzero structure, thereby 307 reusing matrix information to achieve increased efficiency. 308 309 Level: intermediate 310 311 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 312 @*/ 313 #undef __FUNCT__ 314 #define __FUNCT__ "MatCreateAIJCUSPARSE" 315 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 316 { 317 PetscErrorCode ierr; 318 PetscMPIInt size; 319 320 PetscFunctionBegin; 321 ierr = MatCreate(comm,A);CHKERRQ(ierr); 322 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 323 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 324 if (size > 1) { 325 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 326 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 327 } else { 328 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 329 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 330 } 331 PetscFunctionReturn(0); 332 } 333 334 /*M 335 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 336 337 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 338 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 339 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 340 341 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 342 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 343 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 344 for communicators controlling multiple processes. It is recommended that you call both of 345 the above preallocation routines for simplicity. 346 347 Options Database Keys: 348 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 349 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 350 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 351 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 352 353 Level: beginner 354 355 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 356 M 357 M*/ 358