1 #include "petscconf.h" 2 PETSC_CUDA_EXTERN_C_BEGIN 3 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 4 PETSC_CUDA_EXTERN_C_END 5 #include "mpicusparsematimpl.h" 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 9 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 10 { 11 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 12 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 13 PetscErrorCode ierr; 14 PetscInt i; 15 16 PetscFunctionBegin; 17 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 18 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 19 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 20 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 21 22 ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr); 23 ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr); 24 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 25 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 26 if (d_nnz) { 27 for (i=0; i<B->rmap->n; i++) { 28 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 29 } 30 } 31 if (o_nnz) { 32 for (i=0; i<B->rmap->n; i++) { 33 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 34 } 35 } 36 if (!B->preallocated) { 37 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 38 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 40 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr); 42 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 43 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 44 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 45 ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr); 46 } 47 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 48 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 49 ierr=MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 50 ierr=MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 51 B->preallocated = PETSC_TRUE; 52 PetscFunctionReturn(0); 53 } 54 55 #undef __FUNCT__ 56 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 57 PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 58 { 59 PetscErrorCode ierr; 60 61 PetscFunctionBegin; 62 if (right) { 63 ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 64 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 65 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 66 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 67 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 68 } 69 if (left) { 70 ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 71 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 72 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 73 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 74 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 75 } 76 PetscFunctionReturn(0); 77 } 78 79 80 #undef __FUNCT__ 81 #define __FUNCT__ 
"MatMult_MPIAIJCUSPARSE" 82 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 83 { 84 /* This multiplication sequence is different sequence 85 than the CPU version. In particular, the diagonal block 86 multiplication kernel is launched in one stream. Then, 87 in a separate stream, the data transfers from DeviceToHost 88 (with MPI messaging in between), then HostToDevice are 89 launched. Once the data transfer stream is synchronized, 90 to ensure messaging is complete, the MatMultAdd kernel 91 is launched in the original (MatMult) stream to protect 92 against race conditions. 93 94 This sequence should only be called for GPU computation. */ 95 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 96 PetscErrorCode ierr; 97 PetscInt nt; 98 99 PetscFunctionBegin; 100 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 101 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 102 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 103 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 104 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 105 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 106 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 107 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 108 PetscFunctionReturn(0); 109 } 110 111 #undef __FUNCT__ 112 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 113 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 114 { 115 /* This multiplication sequence is different sequence 116 than the CPU version. In particular, the diagonal block 117 multiplication kernel is launched in one stream. Then, 118 in a separate stream, the data transfers from DeviceToHost 119 (with MPI messaging in between), then HostToDevice are 120 launched. Once the data transfer stream is synchronized, 121 to ensure messaging is complete, the MatMultAdd kernel 122 is launched in the original (MatMult) stream to protect 123 against race conditions. 124 125 This sequence should only be called for GPU computation. 

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* Transpose analogue of the streamed sequence in MatMult_MPIAIJCUSPARSE:
     the off-diagonal and diagonal transpose kernels are launched first; the
     ghost contributions accumulated in lvec are then reverse-scattered
     (DeviceToHost transfer, MPI messaging, HostToDevice transfer) and added
     into the owning processes' entries of yy.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal part: accumulate the ghost-column contributions in lvec */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* diagonal part */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* add the ghost contributions into the owning processes' entries of yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
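
/*
   Editor's usage sketch (illustrative addition, not part of the original
   source): the storage format can be chosen per block through the API, or
   from the options database; the ELL and HYB formats require the txpetscgpu
   package (see the manual page at the end of this file).

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);

   or, equivalently, on the command line:

     -mat_cusparse_mult_diag_storage_format ell -mat_cusparse_mult_offdiag_storage_format hyb
*/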

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;   /* null the stored pointer, not just the local copy */
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C","MatCUSPARSESetFormat_MPIAIJCUSPARSE",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
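
/*
   Editor's usage sketch (illustrative addition, not part of the original
   source): the creation paradigm recommended in the manual page below; the
   sizes and per-row nonzero counts are placeholder values.

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,PETSC_NULL,2,PETSC_NULL);CHKERRQ(ierr);

   Passing -mat_type mpiaijcusparse and calling MatSetFromOptions(A) instead of
   MatSetType() selects the type at runtime.
*/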

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  The matrix data is ultimately pushed down
   to NVIDIA GPUs and the CUSPARSE library is used for the calculations.  For good
   matrix assembly performance the user should preallocate the matrix storage by
   setting the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting
   these parameters accurately, performance during matrix assembly can be increased
   by more than a factor of 50.  This type is only available when PETSc is built with
   the 'txpetscgpu' package; configure with --download-txpetscgpu, which also enables
   the non-CSR GPU storage formats for the CUSPARSE matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or PETSC_NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatMPIAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, and d_nnz=o_nnz=PETSC_NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves.  We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs.  These matrices are in CSR format by
   default.  All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library,
   which REQUIRES the 'txpetscgpu' package.  The ELL and HYB formats are also provided by that
   package; configure with --download-txpetscgpu to use the different GPU storage formats with
   the CUSPARSE matrix types.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single-process
   communicator, and MATMPIAIJCUSPARSE otherwise.  As a result, MatSeqAIJSetPreallocation() is
   supported for single-process communicators, and MatMPIAIJSetPreallocation() is supported
   for communicators containing multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format <csr,ell,hyb> - sets the storage format of the diagonal and off-diagonal blocks during a call to MatSetFromOptions() (ell is ELLPACK, hyb is hybrid)
.  -mat_cusparse_mult_diag_storage_format <csr,ell,hyb> - sets the storage format of the diagonal block during a call to MatSetFromOptions()
-  -mat_cusparse_mult_offdiag_storage_format <csr,ell,hyb> - sets the storage format of the off-diagonal block during a call to MatSetFromOptions()

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJ, MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
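
/*
   Editor's usage sketch (illustrative addition, not part of the original
   source): direct creation with MatCreateAIJCUSPARSE(); the sizes and the
   5/2 per-row nonzero counts are placeholder values.

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,PETSC_NULL,2,PETSC_NULL,&A);CHKERRQ(ierr);
     ...insert entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd()...
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/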