#define PETSC_SKIP_COMPLEX
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create the two MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE"
PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
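/* Illustrative sketch (not part of the library): obtaining GPU vectors that
   conform to the row and column layouts of an assembled MPIAIJCUSPARSE matrix A,
   via the routine above.  The names x and b are hypothetical placeholders.

     Vec x,b;
     ierr = MatCreateVecs(A,&x,&b);CHKERRQ(ierr);  // x: right vector, b: left vector, both VECCUSP
     ...                                           // use x and b, e.g. MatMult(A,x,b)
     ierr = VecDestroy(&x);CHKERRQ(ierr);
     ierr = VecDestroy(&b);CHKERRQ(ierr);
*/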
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version.
     In particular, the diagonal-block multiplication kernel is
     launched in one stream.  Then, in a separate stream, the
     DeviceToHost data transfers (with MPI messaging in between)
     and the HostToDevice transfers are launched.  Once the
     data-transfer stream is synchronized, to ensure the messaging
     is complete, the MatMultAdd kernel is launched in the original
     (MatMult) stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
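/* For reference, a sketch of the math the sequence above implements: with the
   usual MPIAIJ splitting into the local diagonal block A_d (a->A) and the
   compacted off-diagonal block A_o (a->B), the product on each process is

       y_local = A_d * x_local + A_o * x_ghost

   where x_ghost (a->lvec) holds the off-process entries of x gathered by the
   scatter.  The A_d kernel overlaps with the gather of x_ghost; the A_o
   contribution is added only after the scatter has completed. */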
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* As in MatMult above, this sequence differs from the CPU version so that
     kernel launches and data transfers can overlap in separate streams.
     Here the off-diagonal contribution B^T*xx is computed into the ghost
     vector first, the diagonal contribution A^T*xx is computed into yy, and
     a reverse scatter then adds the ghost contributions into yy.  The scatter
     is only finalized once messaging is complete, to protect against race
     conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
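/* Illustrative sketch (not part of the library): selecting a storage format
   for an existing MPIAIJCUSPARSE matrix A, either programmatically or from
   the options database.  MAT_CUSPARSE_ELL is just one possible choice here.

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or, equivalently, call MatSetFromOptions(A) and pass
   -mat_cusparse_storage_format ell on the command line. */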
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusparseStruct = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err);

  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
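/* Illustrative sketch (not part of the library): the recommended way to obtain
   a matrix of this type is the generic creation paradigm, which dispatches to
   the constructor above, rather than calling MatCreateAIJCUSPARSE() below
   directly.  M, N, d_nz, and o_nz are hypothetical placeholders.

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);       // or -mat_type mpiaijcusparse with MatSetFromOptions()
     ierr = MatMPIAIJSetPreallocation(A,d_nz,NULL,o_nz,NULL);CHKERRQ(ierr);
*/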
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations.  For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (or o_nnz) is given then d_nz (or o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either the nz scalars or the nnz
   arrays (not both).  Set d_nz = o_nz = PETSC_DEFAULT and d_nnz = o_nnz = NULL
   for PETSc to control dynamic memory allocation.  For large problems you MUST
   preallocate memory or you will get TERRIBLE performance; see the users'
   manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves.  We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
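/* Illustrative sketch (not part of the library): calling the constructor above
   directly.  All sizes and nonzero counts are hypothetical placeholders.

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,
                                 PETSC_DECIDE,PETSC_DECIDE,  // local sizes chosen by PETSc
                                 1000,1000,                  // hypothetical global sizes
                                 5,NULL,                     // d_nz, d_nnz
                                 2,NULL,                     // o_nz, o_nnz
                                 &A);CHKERRQ(ierr);
*/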
/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs.  These matrices can be in either
   CSR, ELL, or Hybrid format.  The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
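/* Illustrative sketch (not part of the library): the options above applied on a
   hypothetical command line, selecting this type and the HYB storage format at
   runtime for a program ("myapp", a placeholder) that calls MatSetFromOptions()
   on its matrix.

     mpiexec -n 4 ./myapp -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
*/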