1 #include "petscconf.h" 2 PETSC_CUDA_EXTERN_C_BEGIN 3 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 4 PETSC_CUDA_EXTERN_C_END 5 #include "mpicusparsematimpl.h" 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 9 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 10 { 11 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 12 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 13 PetscErrorCode ierr; 14 PetscInt i; 15 16 PetscFunctionBegin; 17 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 18 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 19 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 20 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 21 22 ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr); 23 ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr); 24 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 25 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 26 if (d_nnz) { 27 for (i=0; i<B->rmap->n; i++) { 28 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 29 } 30 } 31 if (o_nnz) { 32 for (i=0; i<B->rmap->n; i++) { 33 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 34 } 35 } 36 if (!B->preallocated) { 37 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 38 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 40 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr); 42 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 43 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 44 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 45 ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr); 46 } 47 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 48 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 51 52 B->preallocated = PETSC_TRUE; 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 58 PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 59 { 60 PetscErrorCode ierr; 61 62 PetscFunctionBegin; 63 if (right) { 64 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 65 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 66 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 67 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 68 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 69 } 70 if (left) { 71 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 72 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 73 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 74 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 75 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 76 77 78 } 79 PetscFunctionReturn(0); 80 } 81 82 83 #undef __FUNCT__ 84 #define __FUNCT__ 
"MatMult_MPIAIJCUSPARSE" 85 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 86 { 87 /* This multiplication sequence is different sequence 88 than the CPU version. In particular, the diagonal block 89 multiplication kernel is launched in one stream. Then, 90 in a separate stream, the data transfers from DeviceToHost 91 (with MPI messaging in between), then HostToDevice are 92 launched. Once the data transfer stream is synchronized, 93 to ensure messaging is complete, the MatMultAdd kernel 94 is launched in the original (MatMult) stream to protect 95 against race conditions. 96 97 This sequence should only be called for GPU computation. */ 98 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 99 PetscErrorCode ierr; 100 PetscInt nt; 101 102 PetscFunctionBegin; 103 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 104 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 105 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 106 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 107 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 108 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 109 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 110 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 111 PetscFunctionReturn(0); 112 } 113 114 #undef __FUNCT__ 115 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 116 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 117 { 118 /* This multiplication sequence is different sequence 119 than the CPU version. In particular, the diagonal block 120 multiplication kernel is launched in one stream. Then, 121 in a separate stream, the data transfers from DeviceToHost 122 (with MPI messaging in between), then HostToDevice are 123 launched. Once the data transfer stream is synchronized, 124 to ensure messaging is complete, the MatMultAdd kernel 125 is launched in the original (MatMult) stream to protect 126 against race conditions. 127 128 This sequence should only be called for GPU computation. 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* The transpose analogue of the sequence in MatMult above: both local
     transpose products are launched on the GPU, and the off-process
     contributions accumulated in a->lvec are then added into yy through a
     reverse scatter (DeviceToHost transfer, MPI messaging, HostToDevice
     transfer).

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
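/* The options above map one-to-one onto MatCUSPARSESetFormat() calls; a sketch
   of the equivalent programmatic interface, assuming the MAT_CUSPARSE_ELL and
   MAT_CUSPARSE_HYB enum values provided by the txpetscgpu package (kept out of
   the build since the helper name and format choices are illustrative): */
#if 0
static PetscErrorCode ExampleSetCUSPARSEFormats(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* equivalent to -mat_cusparse_mult_diag_storage_format ell */
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
  /* equivalent to -mat_cusparse_mult_offdiag_storage_format hyb */
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif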
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C","MatCUSPARSESetFormat_MPIAIJCUSPARSE",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). The matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50. This type is only available when using the
   'txpetscgpu' package. Use --download-txpetscgpu to build/install PETSc to use
   different CUSPARSE base matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row), or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row), or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = PETSC_DEFAULT and d_nnz = o_nnz = NULL for PETSc to control
   dynamic memory allocation. For large problems you MUST preallocate memory or
   you will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in CSR format.
   All matrix calculations are performed using the NVIDIA CUSPARSE library. Use of the
   CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
   in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
   GPU storage formats with CUSPARSE matrix types.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
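/* A hedged end-to-end sketch of the convenience routine above (the helper name
   and problem sizes are illustrative, so the sketch is kept out of the build): */
#if 0
static PetscErrorCode ExampleMatCreateAIJCUSPARSE(MPI_Comm comm,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* 100x100 global matrix, PETSc decides the local row/column split; estimate
     5 nonzeros per row in the diagonal block and 2 in the off-diagonal block.
     The -mat_cusparse_* storage-format keys documented above are read when
     MatSetFromOptions() is called on the matrix. */
  ierr = MatCreateAIJCUSPARSE(comm,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif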