1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_CXX_COMPLEX_FIX 3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 4 5 #include <petscconf.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 9 10 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 11 12 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 13 { 14 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 15 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 16 PetscErrorCode ierr; 17 PetscInt i; 18 19 PetscFunctionBegin; 20 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 21 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 22 if (d_nnz) { 23 for (i=0; i<B->rmap->n; i++) { 24 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 25 } 26 } 27 if (o_nnz) { 28 for (i=0; i<B->rmap->n; i++) { 29 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 30 } 31 } 32 if (!B->preallocated) { 33 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 34 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 35 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 36 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 37 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 38 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 39 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 40 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 41 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 42 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 43 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 44 } 45 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 46 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 47 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 48 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 51 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 52 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 53 54 B->preallocated = PETSC_TRUE; 55 PetscFunctionReturn(0); 56 } 57 58 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 59 { 60 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 61 PetscErrorCode ierr; 62 PetscInt nt; 63 64 PetscFunctionBegin; 65 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 66 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 67 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 68 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 69 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 70 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 71 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 72 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 73 PetscFunctionReturn(0); 74 } 75 76 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 77 { 78 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 79 PetscErrorCode ierr; 80 PetscInt nt; 81 82 PetscFunctionBegin; 83 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 84 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 85 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 86 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 87 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 88 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 89 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 90 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 95 { 96 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 97 PetscErrorCode ierr; 98 PetscInt nt; 99 100 PetscFunctionBegin; 101 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 102 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 103 ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr); 104 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 105 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 106 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 107 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 108 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 109 PetscFunctionReturn(0); 110 } 111 112 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 113 { 114 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 115 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 116 117 PetscFunctionBegin; 118 switch (op) { 119 case MAT_CUSPARSE_MULT_DIAG: 120 cusparseStruct->diagGPUMatFormat = format; 121 break; 122 case MAT_CUSPARSE_MULT_OFFDIAG: 123 cusparseStruct->offdiagGPUMatFormat = format; 124 break; 125 case MAT_CUSPARSE_ALL: 126 cusparseStruct->diagGPUMatFormat = format; 127 cusparseStruct->offdiagGPUMatFormat = format; 128 break; 129 default: 130 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 131 } 132 PetscFunctionReturn(0); 133 } 134 135 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 136 { 137 MatCUSPARSEStorageFormat format; 138 PetscErrorCode ierr; 139 PetscBool flg; 140 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 141 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 142 143 PetscFunctionBegin; 144 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 145 if (A->factortype==MAT_FACTOR_NONE) { 146 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 147 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 148 if (flg) { 149 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 150 } 151 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 152 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 153 if (flg) { 154 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 155 } 156 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 157 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 158 if (flg) { 159 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 160 } 161 } 162 ierr = PetscOptionsTail();CHKERRQ(ierr); 163 PetscFunctionReturn(0); 164 } 165 166 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 167 { 168 PetscErrorCode ierr; 169 Mat_MPIAIJ *mpiaij; 170 171 PetscFunctionBegin; 172 mpiaij = (Mat_MPIAIJ*)A->data; 173 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 174 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 175 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 176 } 177 PetscFunctionReturn(0); 178 } 179 180 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 181 { 182 PetscErrorCode ierr; 183 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 184 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 185 cudaError_t err; 186 cusparseStatus_t stat; 187 188 PetscFunctionBegin; 189 try { 190 if (a->A) { ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); } 191 if (a->B) { ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); } 192 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 193 if (cusparseStruct->stream) { 194 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 195 } 196 delete cusparseStruct; 197 } catch(char *ex) { 198 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 199 } 200 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 201 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 202 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 203 PetscFunctionReturn(0); 204 } 205 206 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 207 { 208 PetscErrorCode ierr; 209 Mat_MPIAIJ *a; 210 Mat_MPIAIJCUSPARSE *cusparseStruct; 211 cusparseStatus_t stat; 212 Mat A; 213 214 PetscFunctionBegin; 215 if (reuse == MAT_INITIAL_MATRIX) { 216 ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 217 } else if (reuse == MAT_REUSE_MATRIX) { 218 ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 219 } 220 A = *newmat; 221 222 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 223 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 224 225 a = (Mat_MPIAIJ*)A->data; 226 if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 227 a->spptr = new Mat_MPIAIJCUSPARSE; 228 229 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 230 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 231 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 232 cusparseStruct->stream = 0; 233 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 234 } 235 236 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 237 A->ops->mult = MatMult_MPIAIJCUSPARSE; 238 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 239 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 240 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 241 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 242 243 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 244 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 245 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 246 PetscFunctionReturn(0); 247 } 248 249 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 250 { 251 PetscErrorCode ierr; 252 253 PetscFunctionBegin; 254 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 255 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 256 ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 257 PetscFunctionReturn(0); 258 } 259 260 /*@ 261 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 262 (the default parallel PETSc format). This matrix will ultimately pushed down 263 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 264 assembly performance the user should preallocate the matrix storage by setting 265 the parameter nz (or the array nnz). By setting these parameters accurately, 266 performance during matrix assembly can be increased by more than a factor of 50. 267 268 Collective 269 270 Input Parameters: 271 + comm - MPI communicator, set to PETSC_COMM_SELF 272 . m - number of rows 273 . n - number of columns 274 . nz - number of nonzeros per row (same for all rows) 275 - nnz - array containing the number of nonzeros in the various rows 276 (possibly different for each row) or NULL 277 278 Output Parameter: 279 . A - the matrix 280 281 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 282 MatXXXXSetPreallocation() paradigm instead of this routine directly. 283 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 284 285 Notes: 286 If nnz is given then nz is ignored 287 288 The AIJ format (also called the Yale sparse matrix format or 289 compressed row storage), is fully compatible with standard Fortran 77 290 storage. That is, the stored row and column indices can begin at 291 either one (as in Fortran) or zero. See the users' manual for details. 292 293 Specify the preallocated storage with either nz or nnz (not both). 294 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 295 allocation. For large problems you MUST preallocate memory or you 296 will get TERRIBLE performance, see the users' manual chapter on matrices. 297 298 By default, this format uses inodes (identical nodes) when possible, to 299 improve numerical efficiency of matrix-vector products and solves. We 300 search for consecutive rows with the same nonzero structure, thereby 301 reusing matrix information to achieve increased efficiency. 302 303 Level: intermediate 304 305 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 306 @*/ 307 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 308 { 309 PetscErrorCode ierr; 310 PetscMPIInt size; 311 312 PetscFunctionBegin; 313 ierr = MatCreate(comm,A);CHKERRQ(ierr); 314 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 315 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 316 if (size > 1) { 317 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 318 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 319 } else { 320 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 321 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 322 } 323 PetscFunctionReturn(0); 324 } 325 326 /*MC 327 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 328 329 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 330 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 331 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 332 333 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 334 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 335 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 336 for communicators controlling multiple processes. It is recommended that you call both of 337 the above preallocation routines for simplicity. 338 339 Options Database Keys: 340 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 341 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 342 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 343 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 344 345 Level: beginner 346 347 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 348 M 349 M*/ 350 #else 351 352 /* The following stubs are only provided to satisfy the linker */ 353 354 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 355 { 356 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!"); 357 } 358 #endif 359