#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
PETSC_CUDA_EXTERN_C_END
#include "mpicusparsematimpl.h"

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version.
     In particular, the diagonal block multiplication kernel is
     launched in one stream. Then, in a separate stream, the data
     transfers from DeviceToHost (with MPI messaging in between),
     then HostToDevice, are launched. Once the data transfer stream
     is synchronized, to ensure messaging is complete, the MatMultAdd
     kernel is launched in the original (MatMult) stream to protect
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* Same overlap sequence as MatMult_MPIAIJCUSPARSE above, applied to the
     transpose products: the diagonal block kernel runs in one stream while
     the DeviceToHost/MPI/HostToDevice transfers run in another, and the
     final add kernel runs in the original stream once the transfer stream
     has been synchronized.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* PetscErrorCode MatSetValuesBatch_MPIAIJCUSPARSE(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats); */

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END
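
/* Usage sketch: assembling an MPIAIJCUSPARSE matrix and selecting a GPU
   storage format through the routines above. The sizes, per-row nonzero
   estimates, and the ELL format choice are illustrative only (ELL and HYB
   require the txpetscgpu package):

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,PETSC_NULL,2,PETSC_NULL);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   then set values with MatSetValues() and assemble with
   MatAssemblyBegin()/MatAssemblyEnd() as for any AIJ matrix. */
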
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* Use the current formats as the displayed defaults so that unset
       options leave the matrix unchanged. */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;
  ierr     = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C","MatCUSPARSESetFormat_MPIAIJCUSPARSE",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50. This type is only available when using the
   'txpetscgpu' package. Use --download-txpetscgpu to build/install PETSc to use
   different CUSPARSE base matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or PETSC_NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz or the nnz parameters
   (not both). Set d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL (and likewise for
   the o_* parameters) for PETSc to control dynamic memory allocation. For
   large problems you MUST preallocate memory or you will get TERRIBLE
   performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
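
   Example usage (a sketch; the sizes and per-row nonzero estimates are
   illustrative only):
.vb
   Mat A;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,PETSC_NULL,2,PETSC_NULL,&A);CHKERRQ(ierr);
.ve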

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in CSR format.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library. Use of the
   CUSPARSE library REQUIRES the 'txpetscgpu' package. The ELL and HYB formats are also available
   in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
   GPU storage formats with CUSPARSE matrix types.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJ, MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
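
/* Runtime usage sketch: the storage formats documented above can be selected
   from the options database by any program that calls MatSetFromOptions() on
   the matrix, e.g. (the executable name is illustrative):

     mpiexec -n 4 ./app -mat_type mpiaijcusparse \
         -mat_cusparse_mult_diag_storage_format ell \
         -mat_cusparse_mult_offdiag_storage_format hyb
*/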