1 #include "petscconf.h" 2 PETSC_CUDA_EXTERN_C_BEGIN 3 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 4 PETSC_CUDA_EXTERN_C_END 5 #include "mpicusparsematimpl.h" 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 9 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 10 { 11 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 12 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 13 PetscErrorCode ierr; 14 PetscInt i; 15 16 PetscFunctionBegin; 17 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 18 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 19 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 20 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 21 22 ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr); 23 ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr); 24 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 25 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 26 if (d_nnz) { 27 for (i=0; i<B->rmap->n; i++) { 28 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 29 } 30 } 31 if (o_nnz) { 32 for (i=0; i<B->rmap->n; i++) { 33 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 34 } 35 } 36 if (!B->preallocated) { 37 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 38 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 40 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr); 42 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 43 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 44 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 45 ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr); 46 } 47 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 48 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 51 52 B->preallocated = PETSC_TRUE; 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 58 PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 59 { 60 PetscErrorCode ierr; 61 62 PetscFunctionBegin; 63 if (right) { 64 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 65 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 66 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 67 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 68 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 69 } 70 if (left) { 71 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 72 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 73 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 74 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 75 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 76 77 78 } 79 PetscFunctionReturn(0); 80 } 81 82 83 #undef __FUNCT__ 84 #define __FUNCT__ 
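/*
   A minimal caller-side sketch (not part of this file's API) of how d_nnz and
   o_nnz are typically computed for the routine above: column indices inside
   this process's column ownership range [cstart,cend) count toward the
   diagonal block, all others toward the off-diagonal block.  The CSR arrays
   ia/ja and the counts nlocalrows/d_nnz/o_nnz are hypothetical caller data.

     ierr = MatGetOwnershipRangeColumn(B,&cstart,&cend);CHKERRQ(ierr);
     for (i=0; i<nlocalrows; i++) {
       d_nnz[i] = o_nnz[i] = 0;
       for (j=ia[i]; j<ia[i+1]; j++) {
         if (ja[j] >= cstart && ja[j] < cend) d_nnz[i]++;
         else                                 o_nnz[i]++;
       }
     }
     ierr = MatMPIAIJSetPreallocation(B,0,d_nnz,0,o_nnz);CHKERRQ(ierr);
*/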
"MatMult_MPIAIJCUSPARSE" 85 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 86 { 87 /* This multiplication sequence is different sequence 88 than the CPU version. In particular, the diagonal block 89 multiplication kernel is launched in one stream. Then, 90 in a separate stream, the data transfers from DeviceToHost 91 (with MPI messaging in between), then HostToDevice are 92 launched. Once the data transfer stream is synchronized, 93 to ensure messaging is complete, the MatMultAdd kernel 94 is launched in the original (MatMult) stream to protect 95 against race conditions. 96 97 This sequence should only be called for GPU computation. */ 98 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 99 PetscErrorCode ierr; 100 PetscInt nt; 101 102 PetscFunctionBegin; 103 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 104 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 105 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 106 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 107 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 108 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 109 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 110 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 111 PetscFunctionReturn(0); 112 } 113 114 #undef __FUNCT__ 115 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 116 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 117 { 118 /* This multiplication sequence is different sequence 119 than the CPU version. In particular, the diagonal block 120 multiplication kernel is launched in one stream. Then, 121 in a separate stream, the data transfers from DeviceToHost 122 (with MPI messaging in between), then HostToDevice are 123 launched. Once the data transfer stream is synchronized, 124 to ensure messaging is complete, the MatMultAdd kernel 125 is launched in the original (MatMult) stream to protect 126 against race conditions. 127 128 This sequence should only be called for GPU computation. 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* As in MatMult_MPIAIJCUSPARSE above, the kernels and the data transfers
     are launched in separate streams so that the local work can overlap the
     MPI messaging.  For the transpose product, the local parts of the
     transposed blocks are applied first; the off-diagonal contribution,
     accumulated in the local work vector, is then scattered back
     (SCATTER_REVERSE) and added into yy.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* Off-diagonal contribution, computed into the local work vector. */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* Diagonal-block contribution. */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* Add the partial results together. */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
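/*
   For illustration only (hypothetical application binary "./app"): the
   storage formats handled above are normally chosen from the options
   database at runtime,

     mpiexec -n 4 ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb

   or programmatically, once the matrix type has been set:

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
*/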
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
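/*
   A minimal usage sketch (user application code, not part of this file),
   showing the MatCreate()/MatSetType()/preallocation paradigm that the
   documentation below recommends over the convenience constructor.  The
   sizes M, N and fill estimates d_nz, o_nz are placeholders the caller
   must supply.

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,d_nz,NULL,o_nz,NULL);CHKERRQ(ierr);
     ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), MatMult() ...
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/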
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations.  For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.  This type is only available when using the
   'txpetscgpu' package.  Use --download-txpetscgpu to build/install PETSc to use
   different CUSPARSE base matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either the nz values or the nnz arrays
   (not both).  Set d_nz and o_nz to PETSC_DEFAULT and d_nnz and o_nnz to NULL
   for PETSc to control dynamic memory allocation.  For large problems you MUST
   preallocate memory or you will get TERRIBLE performance; see the users'
   manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves.  We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs.  These matrices can be in CSR format.
   All matrix calculations are performed using the NVIDIA CUSPARSE library.  Use of the
   CUSPARSE library REQUIRES the 'txpetscgpu' package.  The ELL and HYB formats are also
   available in the txpetscgpu package.  Use --download-txpetscgpu to build/install PETSc
   to use different GPU storage formats with CUSPARSE matrix types.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions().  Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions().  Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions().  Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
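/*
   Illustrative call (caller-side sketch; the global sizes M and N and the
   fill estimates are made up): create a distributed matrix with at most 5
   nonzeros per row in the diagonal block and 2 in the off-diagonal block,
   matching the defaults used by the preallocation routine in this file.

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);CHKERRQ(ierr);
*/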