1 #include "petscconf.h" 2 PETSC_CUDA_EXTERN_C_BEGIN 3 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 4 PETSC_CUDA_EXTERN_C_END 5 #include "mpicusparsematimpl.h" 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 9 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 10 { 11 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 12 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 13 PetscErrorCode ierr; 14 PetscInt i; 15 16 PetscFunctionBegin; 17 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 18 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 19 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 20 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 21 22 ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr); 23 ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr); 24 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 25 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 26 if (d_nnz) { 27 for (i=0; i<B->rmap->n; i++) { 28 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 29 } 30 } 31 if (o_nnz) { 32 for (i=0; i<B->rmap->n; i++) { 33 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 34 } 35 } 36 if (!B->preallocated) { 37 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 38 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 40 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr); 42 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 43 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 44 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 45 ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr); 46 } 47 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 48 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 49 ierr=MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 50 ierr=MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 51 B->preallocated = PETSC_TRUE; 52 PetscFunctionReturn(0); 53 } 54 55 #undef __FUNCT__ 56 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 57 PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 58 { 59 PetscErrorCode ierr; 60 61 PetscFunctionBegin; 62 if (right) { 63 ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 64 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 65 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 66 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 67 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 68 } 69 if (left) { 70 ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 71 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 72 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 73 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 74 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 75 } 76 PetscFunctionReturn(0); 77 } 78 79 80 #undef __FUNCT__ 81 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 82 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 83 { 84 /* This multiplication sequence is different sequence 85 than the CPU version. In particular, the diagonal block 86 multiplication kernel is launched in one stream. Then, 87 in a separate stream, the data transfers from DeviceToHost 88 (with MPI messaging in between), then HostToDevice are 89 launched. Once the data transfer stream is synchronized, 90 to ensure messaging is complete, the MatMultAdd kernel 91 is launched in the original (MatMult) stream to protect 92 against race conditions. 93 94 This sequence should only be called for GPU computation. */ 95 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 96 PetscErrorCode ierr; 97 PetscInt nt; 98 99 PetscFunctionBegin; 100 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 101 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 102 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 103 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 104 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 105 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 106 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 107 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 108 PetscFunctionReturn(0); 109 } 110 111 #undef __FUNCT__ 112 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 113 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 114 { 115 /* This multiplication sequence is different sequence 116 than the CPU version. In particular, the diagonal block 117 multiplication kernel is launched in one stream. Then, 118 in a separate stream, the data transfers from DeviceToHost 119 (with MPI messaging in between), then HostToDevice are 120 launched. Once the data transfer stream is synchronized, 121 to ensure messaging is complete, the MatMultAdd kernel 122 is launched in the original (MatMult) stream to protect 123 against race conditions. 124 125 This sequence should only be called for GPU computation. */ 126 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 127 PetscErrorCode ierr; 128 PetscInt nt; 129 130 PetscFunctionBegin; 131 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 132 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 133 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 134 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 135 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 136 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 137 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 138 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 139 PetscFunctionReturn(0); 140 } 141 142 EXTERN_C_BEGIN 143 #undef __FUNCT__ 144 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 145 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 146 { 147 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 148 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 149 150 PetscFunctionBegin; 151 switch (op) { 152 case MAT_CUSPARSE_MULT_DIAG: 153 cusparseStruct->diagGPUMatFormat = format; 154 break; 155 case MAT_CUSPARSE_MULT_OFFDIAG: 156 cusparseStruct->offdiagGPUMatFormat = format; 157 break; 158 case MAT_CUSPARSE_ALL: 159 cusparseStruct->diagGPUMatFormat = format; 160 cusparseStruct->offdiagGPUMatFormat = format; 161 break; 162 default: 163 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 164 } 165 PetscFunctionReturn(0); 166 } 167 EXTERN_C_END 168 169 170 #undef __FUNCT__ 171 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 172 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A) 173 { 174 MatCUSPARSEStorageFormat format; 175 PetscErrorCode ierr; 176 PetscBool flg; 177 178 PetscFunctionBegin; 179 ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr); 180 ierr = PetscObjectOptionsBegin((PetscObject)A); 181 if (A->factortype==MAT_FACTOR_NONE) { 182 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 183 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 184 if (flg) { 185 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 186 } 187 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 188 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 189 if (flg) { 190 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 191 } 192 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 193 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 194 if (flg) { 195 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 196 } 197 } 198 ierr = PetscOptionsEnd();CHKERRQ(ierr); 199 PetscFunctionReturn(0); 200 } 201 202 #undef __FUNCT__ 203 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 204 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 205 { 206 PetscErrorCode ierr; 207 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 208 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 209 210 PetscFunctionBegin; 211 try { 212 delete cusparseStruct; 213 } catch(char* ex) { 214 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 215 } 216 cusparseStruct = 0; 217 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 218 PetscFunctionReturn(0); 219 } 220 221 EXTERN_C_BEGIN 222 #undef __FUNCT__ 223 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 224 PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 225 { 226 PetscErrorCode ierr; 227 Mat_MPIAIJ *a; 228 Mat_MPIAIJCUSPARSE * cusparseStruct; 229 230 PetscFunctionBegin; 231 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 232 ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C", 233 "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE", 234 MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 235 a = (Mat_MPIAIJ*)A->data; 236 a->spptr = new Mat_MPIAIJCUSPARSE; 237 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 238 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 239 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 240 A->ops->getvecs = MatGetVecs_MPIAIJCUSPARSE; 241 A->ops->mult = MatMult_MPIAIJCUSPARSE; 242 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 243 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 244 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 245 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 246 ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_MPIAIJCUSPARSE", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 247 PetscFunctionReturn(0); 248 } 249 EXTERN_C_END 250 251 /*@ 252 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 253 (the default parallel PETSc format). This matrix will ultimately pushed down 254 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 255 assembly performance the user should preallocate the matrix storage by setting 256 the parameter nz (or the array nnz). By setting these parameters accurately, 257 performance during matrix assembly can be increased by more than a factor of 50. 258 This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu 259 to build/install PETSc to use different CUSPARSE base matrix types. 260 261 Collective on MPI_Comm 262 263 Input Parameters: 264 + comm - MPI communicator, set to PETSC_COMM_SELF 265 . m - number of rows 266 . n - number of columns 267 . nz - number of nonzeros per row (same for all rows) 268 - nnz - array containing the number of nonzeros in the various rows 269 (possibly different for each row) or PETSC_NULL 270 271 Output Parameter: 272 . A - the matrix 273 274 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 275 MatXXXXSetPreallocation() paradigm instead of this routine directly. 276 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 277 278 Notes: 279 If nnz is given then nz is ignored 280 281 The AIJ format (also called the Yale sparse matrix format or 282 compressed row storage), is fully compatible with standard Fortran 77 283 storage. That is, the stored row and column indices can begin at 284 either one (as in Fortran) or zero. See the users' manual for details. 285 286 Specify the preallocated storage with either nz or nnz (not both). 287 Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 288 allocation. For large problems you MUST preallocate memory or you 289 will get TERRIBLE performance, see the users' manual chapter on matrices. 290 291 By default, this format uses inodes (identical nodes) when possible, to 292 improve numerical efficiency of matrix-vector products and solves. We 293 search for consecutive rows with the same nonzero structure, thereby 294 reusing matrix information to achieve increased efficiency. 295 296 Level: intermediate 297 298 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 299 @*/ 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatCreateAIJCUSPARSE" 302 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 303 { 304 PetscErrorCode ierr; 305 PetscMPIInt size; 306 307 PetscFunctionBegin; 308 ierr = MatCreate(comm,A);CHKERRQ(ierr); 309 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 310 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 311 if (size > 1) { 312 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 313 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 314 } else { 315 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 316 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 317 } 318 PetscFunctionReturn(0); 319 } 320 321 /*M 322 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 323 324 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in CSR format. 325 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. Use of the 326 CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available 327 in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different 328 GPU storage formats with CUSPARSE matrix types. 329 330 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 331 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 332 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 333 for communicators controlling multiple processes. It is recommended that you call both of 334 the above preallocation routines for simplicity. 335 336 Options Database Keys: 337 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 338 . -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). 339 . -mat_cusparse_mult_diag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal matrix during a call to MatSetFromOptions(). 340 - -mat_cusparse_mult_offdiag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). 341 342 Level: beginner 343 344 .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 345 M*/ 346