1 #include "petscconf.h" 2 PETSC_CUDA_EXTERN_C_BEGIN 3 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 4 PETSC_CUDA_EXTERN_C_END 5 #include "mpicusparsematimpl.h" 6 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 10 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 11 { 12 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 14 PetscErrorCode ierr; 15 PetscInt i; 16 17 PetscFunctionBegin; 18 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 19 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 20 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 21 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 22 23 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 24 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 25 if (d_nnz) { 26 for (i=0; i<B->rmap->n; i++) { 27 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 28 } 29 } 30 if (o_nnz) { 31 for (i=0; i<B->rmap->n; i++) { 32 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 33 } 34 } 35 if (!B->preallocated) { 36 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 37 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 38 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 39 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 40 ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr); 41 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 42 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 43 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 44 ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr); 45 } 46 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 47 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 48 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 51 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 52 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 53 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 54 55 B->preallocated = PETSC_TRUE; 56 PetscFunctionReturn(0); 57 } 58 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 61 PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 62 { 63 PetscErrorCode ierr; 64 65 PetscFunctionBegin; 66 if (right) { 67 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 68 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 69 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 70 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 71 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 72 } 73 if (left) { 74 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 75 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 76 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 77 ierr = 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In particular,
     the diagonal block multiplication kernel is launched in one stream. Then,
     in a separate stream, the DeviceToHost data transfers (with MPI messaging
     in between) and the HostToDevice transfers are launched. Once the data
     transfer stream is synchronized, to ensure messaging is complete, the
     MatMultAdd kernel is launched in the original (MatMult) stream to protect
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
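/* Timeline sketch of the overlap described in MatMult above (an illustration
   only; the exact kernel and copy boundaries depend on the VecScatter
   implementation):

     MatMult stream:   [ A(diag) SpMV ] .......................... [ B(offdiag) SpMV-add ]
     transfer stream:  [ D2H copy ][ MPI send/recv ][ H2D copy ]
                                                                  ^ streams synchronized here
*/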
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* The transpose analogue of the MatMult sequence above: the off-diagonal
     block transpose is applied first so that its contribution can be sent on
     its way (reverse scatter with ADD_VALUES) while the diagonal block
     transpose multiplication kernel runs; the scatter is completed only after
     the local part has been computed, to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal part into the local work vector */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* send the off-process contributions on their way */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  /* diagonal (local) part */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* receive and add the remote contributions */
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
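/* Example (runtime options; the executable name is illustrative): select the
   hybrid format for the off-diagonal blocks while keeping CSR for the
   diagonal blocks:

     ./myapp -mat_type mpiaijcusparse -mat_cusparse_mult_offdiag_storage_format hyb
*/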
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err);

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
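/* Example (a sketch): the storage formats can also be chosen programmatically
   once the type has been set; the ELL format requires CUDA 4.2 or later:

     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
*/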
/*@
   MatCreateAIJCUSPARSE - Creates a sparse parallel matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the off-diagonal block)
   for PETSc to control dynamic memory allocation. For large problems you MUST
   preallocate memory or you will get TERRIBLE performance, see the users' manual
   chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
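/* Example usage of MatCreateAIJCUSPARSE (a sketch; the global sizes M,N and the
   assembly loop are assumed to be supplied by the caller):

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);CHKERRQ(ierr);
     ... set entries with MatSetValues() ...
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/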