#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In particular,
     the diagonal block multiplication kernel is launched in one stream. Then,
     in a separate stream, the DeviceToHost data transfers (with MPI messaging
     in between) and the subsequent HostToDevice transfers are launched. Once
     the data transfer stream is synchronized, to ensure the messaging is
     complete, the MatMultAdd kernel is launched in the original (MatMult)
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* As in MatMult_MPIAIJCUSPARSE above, this sequence differs from the CPU
     version so that the local kernel launches and the scatter communication
     can proceed in separate streams. For the transpose, the off-diagonal
     block produces contributions to remote entries of yy, so its result is
     computed into the local work vector and accumulated into yy with a
     reverse scatter using ADD_VALUES.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
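
/* A minimal sketch of exercising the options registered above from the
   command line. The executable name is a placeholder; the format values
   (csr, ell, hyb) are those documented in the MATAIJCUSPARSE manpage at
   the end of this file:

     ./myapp -mat_type mpiaijcusparse \
             -mat_cusparse_mult_diag_storage_format ell \
             -mat_cusparse_mult_offdiag_storage_format csr

   or, to set both blocks at once:

     ./myapp -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb   */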

PETSC_EXTERN PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat,MatAssemblyType);

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr   = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = NULL;  /* avoid leaving a dangling pointer behind */

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz or the nnz parameters (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
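
   Example usage (a minimal sketch; the local size of 100 rows and the per-row
   estimates of 5 diagonal-block and 2 off-diagonal-block nonzeros are
   illustrative assumptions, not values prescribed by this routine):
.vb
   Mat            A;
   PetscErrorCode ierr;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,100,100,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,&A);CHKERRQ(ierr);
   /* insert entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd(), then use A */
   ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve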

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
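
/* A minimal sketch of selecting storage formats programmatically rather than
   through the options database. It assumes an already created and assembled
   MPIAIJCUSPARSE matrix A, and assumes MAT_CUSPARSE_HYB is among the
   MatCUSPARSEStorageFormat values, matching the hyb option documented above:

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);   */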