1 #include "petscconf.h" 2 PETSC_CUDA_EXTERN_C_BEGIN 3 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 4 PETSC_CUDA_EXTERN_C_END 5 #include "mpicusparsematimpl.h" 6 7 #undef __FUNCT__ 8 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 9 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 10 { 11 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 12 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 13 PetscErrorCode ierr; 14 PetscInt i; 15 16 PetscFunctionBegin; 17 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 18 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 19 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 20 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 21 22 ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr); 23 ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr); 24 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 25 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 26 if (d_nnz) { 27 for (i=0; i<B->rmap->n; i++) { 28 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 29 } 30 } 31 if (o_nnz) { 32 for (i=0; i<B->rmap->n; i++) { 33 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 34 } 35 } 36 if (!B->preallocated) { 37 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 38 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 40 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr); 42 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 43 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 44 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 45 ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr); 46 } 47 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 48 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 51 52 B->preallocated = PETSC_TRUE; 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 58 PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 59 { 60 PetscErrorCode ierr; 61 62 PetscFunctionBegin; 63 if (right) { 64 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 65 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 66 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 67 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 68 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 69 } 70 if (left) { 71 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 72 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 73 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 74 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 75 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 76 77 78 } 79 PetscFunctionReturn(0); 80 } 81 82 83 #undef __FUNCT__ 84 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 85 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 86 { 87 /* This multiplication sequence is different sequence 88 than the CPU version. In particular, the diagonal block 89 multiplication kernel is launched in one stream. Then, 90 in a separate stream, the data transfers from DeviceToHost 91 (with MPI messaging in between), then HostToDevice are 92 launched. Once the data transfer stream is synchronized, 93 to ensure messaging is complete, the MatMultAdd kernel 94 is launched in the original (MatMult) stream to protect 95 against race conditions. 96 97 This sequence should only be called for GPU computation. */ 98 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 99 PetscErrorCode ierr; 100 PetscInt nt; 101 102 PetscFunctionBegin; 103 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 104 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 105 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 106 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 107 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 108 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 109 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 110 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 111 PetscFunctionReturn(0); 112 } 113 114 #undef __FUNCT__ 115 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 116 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 117 { 118 /* This multiplication sequence is different sequence 119 than the CPU version. In particular, the diagonal block 120 multiplication kernel is launched in one stream. Then, 121 in a separate stream, the data transfers from DeviceToHost 122 (with MPI messaging in between), then HostToDevice are 123 launched. Once the data transfer stream is synchronized, 124 to ensure messaging is complete, the MatMultAdd kernel 125 is launched in the original (MatMult) stream to protect 126 against race conditions. 127 128 This sequence should only be called for GPU computation. */ 129 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 130 PetscErrorCode ierr; 131 PetscInt nt; 132 133 PetscFunctionBegin; 134 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 135 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 136 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 137 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 138 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 139 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 140 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 141 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 142 PetscFunctionReturn(0); 143 } 144 145 EXTERN_C_BEGIN 146 #undef __FUNCT__ 147 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 148 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 149 { 150 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 151 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 152 153 PetscFunctionBegin; 154 switch (op) { 155 case MAT_CUSPARSE_MULT_DIAG: 156 cusparseStruct->diagGPUMatFormat = format; 157 break; 158 case MAT_CUSPARSE_MULT_OFFDIAG: 159 cusparseStruct->offdiagGPUMatFormat = format; 160 break; 161 case MAT_CUSPARSE_ALL: 162 cusparseStruct->diagGPUMatFormat = format; 163 cusparseStruct->offdiagGPUMatFormat = format; 164 break; 165 default: 166 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 167 } 168 PetscFunctionReturn(0); 169 } 170 EXTERN_C_END 171 172 173 #undef __FUNCT__ 174 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 175 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A) 176 { 177 MatCUSPARSEStorageFormat format; 178 PetscErrorCode ierr; 179 PetscBool flg; 180 181 PetscFunctionBegin; 182 ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr); 183 ierr = PetscObjectOptionsBegin((PetscObject)A); 184 if (A->factortype==MAT_FACTOR_NONE) { 185 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 186 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 187 if (flg) { 188 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 189 } 190 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 191 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 192 if (flg) { 193 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 194 } 195 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 196 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 197 if (flg) { 198 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 199 } 200 } 201 ierr = PetscOptionsEnd();CHKERRQ(ierr); 202 PetscFunctionReturn(0); 203 } 204 205 #undef __FUNCT__ 206 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 207 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 208 { 209 PetscErrorCode ierr; 210 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 211 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 212 213 PetscFunctionBegin; 214 try { 215 delete cusparseStruct; 216 } catch(char *ex) { 217 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 218 } 219 cusparseStruct = 0; 220 221 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 222 PetscFunctionReturn(0); 223 } 224 225 EXTERN_C_BEGIN 226 #undef __FUNCT__ 227 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 228 PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 229 { 230 PetscErrorCode ierr; 231 Mat_MPIAIJ *a; 232 Mat_MPIAIJCUSPARSE * cusparseStruct; 233 234 PetscFunctionBegin; 235 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 236 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C","MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 237 a = (Mat_MPIAIJ*)A->data; 238 a->spptr = new Mat_MPIAIJCUSPARSE; 239 240 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 241 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 242 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 243 244 A->ops->getvecs = MatGetVecs_MPIAIJCUSPARSE; 245 A->ops->mult = MatMult_MPIAIJCUSPARSE; 246 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 247 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 248 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 249 250 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 251 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_MPIAIJCUSPARSE", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 252 PetscFunctionReturn(0); 253 } 254 EXTERN_C_END 255 256 /*@ 257 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 258 (the default parallel PETSc format). This matrix will ultimately pushed down 259 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 260 assembly performance the user should preallocate the matrix storage by setting 261 the parameter nz (or the array nnz). By setting these parameters accurately, 262 performance during matrix assembly can be increased by more than a factor of 50. 263 This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu 264 to build/install PETSc to use different CUSPARSE base matrix types. 265 266 Collective on MPI_Comm 267 268 Input Parameters: 269 + comm - MPI communicator, set to PETSC_COMM_SELF 270 . m - number of rows 271 . n - number of columns 272 . nz - number of nonzeros per row (same for all rows) 273 - nnz - array containing the number of nonzeros in the various rows 274 (possibly different for each row) or NULL 275 276 Output Parameter: 277 . A - the matrix 278 279 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 280 MatXXXXSetPreallocation() paradigm instead of this routine directly. 281 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 282 283 Notes: 284 If nnz is given then nz is ignored 285 286 The AIJ format (also called the Yale sparse matrix format or 287 compressed row storage), is fully compatible with standard Fortran 77 288 storage. That is, the stored row and column indices can begin at 289 either one (as in Fortran) or zero. See the users' manual for details. 290 291 Specify the preallocated storage with either nz or nnz (not both). 292 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 293 allocation. For large problems you MUST preallocate memory or you 294 will get TERRIBLE performance, see the users' manual chapter on matrices. 295 296 By default, this format uses inodes (identical nodes) when possible, to 297 improve numerical efficiency of matrix-vector products and solves. We 298 search for consecutive rows with the same nonzero structure, thereby 299 reusing matrix information to achieve increased efficiency. 300 301 Level: intermediate 302 303 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 304 @*/ 305 #undef __FUNCT__ 306 #define __FUNCT__ "MatCreateAIJCUSPARSE" 307 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 308 { 309 PetscErrorCode ierr; 310 PetscMPIInt size; 311 312 PetscFunctionBegin; 313 ierr = MatCreate(comm,A);CHKERRQ(ierr); 314 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 315 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 316 if (size > 1) { 317 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 318 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 319 } else { 320 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 321 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 322 } 323 PetscFunctionReturn(0); 324 } 325 326 /*M 327 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 328 329 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in CSR format. 330 All matrix calculations are performed using the Nvidia CUSPARSE library. Use of the 331 CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available 332 in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different 333 GPU storage formats with CUSPARSE matrix types. 334 335 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 336 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 337 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 338 for communicators controlling multiple processes. It is recommended that you call both of 339 the above preallocation routines for simplicity. 340 341 Options Database Keys: 342 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 343 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 344 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 345 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 346 347 Level: beginner 348 349 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 350 M 351 M*/ 352