1 #define PETSC_SKIP_SPINLOCK 2 3 #include <petscconf.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 6 7 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 8 { 9 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 10 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 11 PetscErrorCode ierr; 12 PetscInt i; 13 14 PetscFunctionBegin; 15 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 16 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 17 if (d_nnz) { 18 for (i=0; i<B->rmap->n; i++) { 19 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 20 } 21 } 22 if (o_nnz) { 23 for (i=0; i<B->rmap->n; i++) { 24 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 25 } 26 } 27 if (!B->preallocated) { 28 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. 
*/ 29 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 30 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 31 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 32 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 33 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 34 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 35 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 36 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 37 } 38 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 39 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 40 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 41 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 42 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 43 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 44 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 45 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 46 47 B->preallocated = PETSC_TRUE; 48 PetscFunctionReturn(0); 49 } 50 51 PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 52 { 53 PetscErrorCode ierr; 54 PetscInt rbs,cbs; 55 56 PetscFunctionBegin; 57 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 58 if (right) { 59 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 60 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 61 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 62 ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr); 63 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 64 } 65 if (left) { 66 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 67 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 
68 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 69 ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr); 70 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 71 72 73 } 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 78 { 79 /* This multiplication sequence is different sequence 80 than the CPU version. In particular, the diagonal block 81 multiplication kernel is launched in one stream. Then, 82 in a separate stream, the data transfers from DeviceToHost 83 (with MPI messaging in between), then HostToDevice are 84 launched. Once the data transfer stream is synchronized, 85 to ensure messaging is complete, the MatMultAdd kernel 86 is launched in the original (MatMult) stream to protect 87 against race conditions. 88 89 This sequence should only be called for GPU computation. */ 90 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 91 PetscErrorCode ierr; 92 PetscInt nt; 93 94 PetscFunctionBegin; 95 PetscCheckTypeNames(xx,VECSEQCUDA,VECMPICUDA); 96 PetscCheckTypeNames(yy,VECSEQCUDA,VECMPICUDA); 97 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 98 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 99 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 100 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 101 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 102 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 103 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 104 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 105 PetscFunctionReturn(0); 106 } 107 108 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 109 { 110 /* This multiplication sequence is different sequence 111 than the CPU version. In particular, the diagonal block 112 multiplication kernel is launched in one stream. 
Then, 113 in a separate stream, the data transfers from DeviceToHost 114 (with MPI messaging in between), then HostToDevice are 115 launched. Once the data transfer stream is synchronized, 116 to ensure messaging is complete, the MatMultAdd kernel 117 is launched in the original (MatMult) stream to protect 118 against race conditions. 119 120 This sequence should only be called for GPU computation. */ 121 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 122 PetscErrorCode ierr; 123 PetscInt nt; 124 125 PetscFunctionBegin; 126 PetscCheckTypeNames(xx,VECSEQCUDA,VECMPICUDA); 127 PetscCheckTypeNames(yy,VECSEQCUDA,VECMPICUDA); 128 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 129 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 130 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 131 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 132 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 133 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 134 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 135 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 136 PetscFunctionReturn(0); 137 } 138 139 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 140 { 141 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 142 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 143 144 PetscFunctionBegin; 145 switch (op) { 146 case MAT_CUSPARSE_MULT_DIAG: 147 cusparseStruct->diagGPUMatFormat = format; 148 break; 149 case MAT_CUSPARSE_MULT_OFFDIAG: 150 cusparseStruct->offdiagGPUMatFormat = format; 151 break; 152 case MAT_CUSPARSE_ALL: 153 cusparseStruct->diagGPUMatFormat = format; 154 cusparseStruct->offdiagGPUMatFormat = format; 155 break; 156 default: 157 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported 
operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 158 } 159 PetscFunctionReturn(0); 160 } 161 162 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 163 { 164 MatCUSPARSEStorageFormat format; 165 PetscErrorCode ierr; 166 PetscBool flg; 167 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 168 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 169 170 PetscFunctionBegin; 171 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 172 ierr = PetscObjectOptionsBegin((PetscObject)A); 173 if (A->factortype==MAT_FACTOR_NONE) { 174 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 175 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 176 if (flg) { 177 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 178 } 179 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 180 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 181 if (flg) { 182 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 183 } 184 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 185 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 186 if (flg) { 187 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 188 } 189 } 190 ierr = PetscOptionsEnd();CHKERRQ(ierr); 191 PetscFunctionReturn(0); 192 } 193 194 
/* Post-assembly hook: after the standard MPIAIJ assembly, convert the ghost
   vector lvec to a CUDA vector on the first final assembly so the
   off-diagonal multiply can run on the GPU. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Release the CUSPARSE-specific resources (per-block handles, the shared
   cuSPARSE handle and CUDA stream, and the spptr struct), then delegate to
   the generic MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* NOTE(review): this only nulls the local copy; a->spptr still holds the
     deleted pointer until MatDestroy_MPIAIJ frees the Mat_MPIAIJ struct —
     confirm nothing reads spptr in between. */
  cusparseStruct = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build on MatCreate_MPIAIJ, allocate the
   Mat_MPIAIJCUSPARSE struct (CSR formats by default, fresh cuSPARSE handle
   and CUDA stream), and override the GPU-aware operations. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated)
.  n - number of local columns (or PETSC_DECIDE to have it calculated)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* On one process the sequential CUSPARSE type is used directly */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/