1 #include "petscconf.h" 2 PETSC_CUDA_EXTERN_C_BEGIN 3 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 4 PETSC_CUDA_EXTERN_C_END 5 #include "mpicusparsematimpl.h" 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 9 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 10 { 11 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 12 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 13 PetscErrorCode ierr; 14 PetscInt i; 15 16 PetscFunctionBegin; 17 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 18 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 19 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 20 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 21 22 ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr); 23 ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr); 24 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 25 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 26 if (d_nnz) { 27 for (i=0; i<B->rmap->n; i++) { 28 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 29 } 30 } 31 if (o_nnz) { 32 for (i=0; i<B->rmap->n; i++) { 33 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 34 } 35 } 36 if (!B->preallocated) { 37 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 38 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 40 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr); 42 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 43 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 44 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 45 ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr); 46 } 47 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 48 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 51 52 B->preallocated = PETSC_TRUE; 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 58 PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 59 { 60 PetscErrorCode ierr; 61 62 PetscFunctionBegin; 63 if (right) { 64 ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 65 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 66 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 67 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 68 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 69 } 70 if (left) { 71 ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 72 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 73 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 74 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 75 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 76 } 77 PetscFunctionReturn(0); 78 } 79 80 81 #undef __FUNCT__ 82 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 83 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 84 { 85 /* This multiplication sequence is different sequence 86 than the CPU version. In particular, the diagonal block 87 multiplication kernel is launched in one stream. Then, 88 in a separate stream, the data transfers from DeviceToHost 89 (with MPI messaging in between), then HostToDevice are 90 launched. Once the data transfer stream is synchronized, 91 to ensure messaging is complete, the MatMultAdd kernel 92 is launched in the original (MatMult) stream to protect 93 against race conditions. 94 95 This sequence should only be called for GPU computation. */ 96 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 97 PetscErrorCode ierr; 98 PetscInt nt; 99 100 PetscFunctionBegin; 101 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 102 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 103 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 104 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 105 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 106 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 107 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 108 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 109 PetscFunctionReturn(0); 110 } 111 112 #undef __FUNCT__ 113 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 114 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 115 { 116 /* This multiplication sequence is different sequence 117 than the CPU version. In particular, the diagonal block 118 multiplication kernel is launched in one stream. Then, 119 in a separate stream, the data transfers from DeviceToHost 120 (with MPI messaging in between), then HostToDevice are 121 launched. Once the data transfer stream is synchronized, 122 to ensure messaging is complete, the MatMultAdd kernel 123 is launched in the original (MatMult) stream to protect 124 against race conditions. 125 126 This sequence should only be called for GPU computation. */ 127 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 128 PetscErrorCode ierr; 129 PetscInt nt; 130 131 PetscFunctionBegin; 132 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 133 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 134 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 135 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 136 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 137 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 138 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 139 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 140 PetscFunctionReturn(0); 141 } 142 143 EXTERN_C_BEGIN 144 #undef __FUNCT__ 145 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 146 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 147 { 148 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 149 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 150 151 PetscFunctionBegin; 152 switch (op) { 153 case MAT_CUSPARSE_MULT_DIAG: 154 cusparseStruct->diagGPUMatFormat = format; 155 break; 156 case MAT_CUSPARSE_MULT_OFFDIAG: 157 cusparseStruct->offdiagGPUMatFormat = format; 158 break; 159 case MAT_CUSPARSE_ALL: 160 cusparseStruct->diagGPUMatFormat = format; 161 cusparseStruct->offdiagGPUMatFormat = format; 162 break; 163 default: 164 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 165 } 166 PetscFunctionReturn(0); 167 } 168 EXTERN_C_END 169 170 171 #undef __FUNCT__ 172 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 173 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A) 174 { 175 MatCUSPARSEStorageFormat format; 176 PetscErrorCode ierr; 177 PetscBool flg; 178 179 PetscFunctionBegin; 180 ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr); 181 ierr = PetscObjectOptionsBegin((PetscObject)A); 182 if (A->factortype==MAT_FACTOR_NONE) { 183 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 184 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 185 if (flg) { 186 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 187 } 188 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 189 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 190 if (flg) { 191 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 192 } 193 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 194 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 195 if (flg) { 196 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 197 } 198 } 199 ierr = PetscOptionsEnd();CHKERRQ(ierr); 200 PetscFunctionReturn(0); 201 } 202 203 #undef __FUNCT__ 204 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 205 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 206 { 207 PetscErrorCode ierr; 208 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 209 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 210 211 PetscFunctionBegin; 212 try { 213 delete cusparseStruct; 214 } catch(char *ex) { 215 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 216 } 217 cusparseStruct = 0; 218 219 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 220 PetscFunctionReturn(0); 221 } 222 223 EXTERN_C_BEGIN 224 #undef __FUNCT__ 225 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 226 PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 227 { 228 PetscErrorCode ierr; 229 Mat_MPIAIJ *a; 230 Mat_MPIAIJCUSPARSE * cusparseStruct; 231 232 PetscFunctionBegin; 233 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 234 ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C", 235 "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE", 236 MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 237 a = (Mat_MPIAIJ*)A->data; 238 a->spptr = new Mat_MPIAIJCUSPARSE; 239 240 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 241 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 242 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 243 244 A->ops->getvecs = MatGetVecs_MPIAIJCUSPARSE; 245 A->ops->mult = MatMult_MPIAIJCUSPARSE; 246 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 247 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 248 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 249 250 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 251 ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_MPIAIJCUSPARSE", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 252 PetscFunctionReturn(0); 253 } 254 EXTERN_C_END 255 256 /*@ 257 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 258 (the default parallel PETSc format). This matrix will ultimately pushed down 259 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 260 assembly performance the user should preallocate the matrix storage by setting 261 the parameter nz (or the array nnz). By setting these parameters accurately, 262 performance during matrix assembly can be increased by more than a factor of 50. 263 This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu 264 to build/install PETSc to use different CUSPARSE base matrix types. 265 266 Collective on MPI_Comm 267 268 Input Parameters: 269 + comm - MPI communicator, set to PETSC_COMM_SELF 270 . m - number of rows 271 . n - number of columns 272 . nz - number of nonzeros per row (same for all rows) 273 - nnz - array containing the number of nonzeros in the various rows 274 (possibly different for each row) or PETSC_NULL 275 276 Output Parameter: 277 . A - the matrix 278 279 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 280 MatXXXXSetPreallocation() paradigm instead of this routine directly. 281 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 282 283 Notes: 284 If nnz is given then nz is ignored 285 286 The AIJ format (also called the Yale sparse matrix format or 287 compressed row storage), is fully compatible with standard Fortran 77 288 storage. That is, the stored row and column indices can begin at 289 either one (as in Fortran) or zero. See the users' manual for details. 290 291 Specify the preallocated storage with either nz or nnz (not both). 292 Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 293 allocation. For large problems you MUST preallocate memory or you 294 will get TERRIBLE performance, see the users' manual chapter on matrices. 295 296 By default, this format uses inodes (identical nodes) when possible, to 297 improve numerical efficiency of matrix-vector products and solves. We 298 search for consecutive rows with the same nonzero structure, thereby 299 reusing matrix information to achieve increased efficiency. 300 301 Level: intermediate 302 303 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 304 @*/ 305 #undef __FUNCT__ 306 #define __FUNCT__ "MatCreateAIJCUSPARSE" 307 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 308 { 309 PetscErrorCode ierr; 310 PetscMPIInt size; 311 312 PetscFunctionBegin; 313 ierr = MatCreate(comm,A);CHKERRQ(ierr); 314 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 315 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 316 if (size > 1) { 317 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 318 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 319 } else { 320 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 321 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 322 } 323 PetscFunctionReturn(0); 324 } 325 326 /*M 327 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 328 329 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in CSR format. 330 All matrix calculations are performed using the Nvidia CUSPARSE library. Use of the 331 CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available 332 in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different 333 GPU storage formats with CUSPARSE matrix types. 334 335 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 336 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 337 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 338 for communicators controlling multiple processes. It is recommended that you call both of 339 the above preallocation routines for simplicity. 340 341 Options Database Keys: 342 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 343 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 344 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 345 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 346 347 Level: beginner 348 349 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 350 M 351 M*/ 352