1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_CXX_COMPLEX_FIX 3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 4 5 #include <petscconf.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 9 10 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 11 { 12 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 14 PetscErrorCode ierr; 15 PetscInt i; 16 17 PetscFunctionBegin; 18 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 19 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 20 if (d_nnz) { 21 for (i=0; i<B->rmap->n; i++) { 22 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 23 } 24 } 25 if (o_nnz) { 26 for (i=0; i<B->rmap->n; i++) { 27 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 28 } 29 } 30 if (!B->preallocated) { 31 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 32 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 33 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 34 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 35 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 36 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 37 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 38 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 40 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 42 } 43 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 44 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 45 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 46 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 47 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 48 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 51 52 B->preallocated = PETSC_TRUE; 53 PetscFunctionReturn(0); 54 } 55 56 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 57 { 58 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 59 PetscErrorCode ierr; 60 PetscInt nt; 61 62 PetscFunctionBegin; 63 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 64 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 65 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 66 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 67 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 68 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 69 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 70 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 71 PetscFunctionReturn(0); 72 } 73 74 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 75 { 76 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 77 PetscErrorCode ierr; 78 PetscInt nt; 79 80 PetscFunctionBegin; 81 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 82 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 83 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 84 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 85 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 86 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 87 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 88 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 89 PetscFunctionReturn(0); 90 } 91 92 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 93 { 94 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 95 PetscErrorCode ierr; 96 PetscInt nt; 97 98 PetscFunctionBegin; 99 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 100 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 101 ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr); 102 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 103 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 104 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 105 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 106 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 107 PetscFunctionReturn(0); 108 } 109 110 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 111 { 112 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 113 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 114 115 PetscFunctionBegin; 116 switch (op) { 117 case MAT_CUSPARSE_MULT_DIAG: 118 cusparseStruct->diagGPUMatFormat = format; 119 break; 120 case MAT_CUSPARSE_MULT_OFFDIAG: 121 cusparseStruct->offdiagGPUMatFormat = format; 122 break; 123 case MAT_CUSPARSE_ALL: 124 cusparseStruct->diagGPUMatFormat = format; 125 cusparseStruct->offdiagGPUMatFormat = format; 126 break; 127 default: 128 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 129 } 130 PetscFunctionReturn(0); 131 } 132 133 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 134 { 135 MatCUSPARSEStorageFormat format; 136 PetscErrorCode ierr; 137 PetscBool flg; 138 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 139 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 140 141 PetscFunctionBegin; 142 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 143 if (A->factortype==MAT_FACTOR_NONE) { 144 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 145 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 146 if (flg) { 147 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 148 } 149 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 150 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 151 if (flg) { 152 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 153 } 154 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 155 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 156 if (flg) { 157 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 158 } 159 } 160 ierr = PetscOptionsTail();CHKERRQ(ierr); 161 PetscFunctionReturn(0); 162 } 163 164 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 165 { 166 PetscErrorCode ierr; 167 Mat_MPIAIJ *mpiaij; 168 169 PetscFunctionBegin; 170 mpiaij = (Mat_MPIAIJ*)A->data; 171 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 172 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 173 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 174 } 175 PetscFunctionReturn(0); 176 } 177 178 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 179 { 180 PetscErrorCode ierr; 181 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 182 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 183 cudaError_t err; 184 cusparseStatus_t stat; 185 186 PetscFunctionBegin; 187 try { 188 if (a->A) { ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); } 189 if (a->B) { ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); } 190 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 191 if (cusparseStruct->stream) { 192 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 193 } 194 delete cusparseStruct; 195 } catch(char *ex) { 196 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 197 } 198 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 199 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 200 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 201 PetscFunctionReturn(0); 202 } 203 204 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 205 { 206 PetscErrorCode ierr; 207 Mat_MPIAIJ *a; 208 Mat_MPIAIJCUSPARSE *cusparseStruct; 209 cusparseStatus_t stat; 210 Mat A; 211 212 PetscFunctionBegin; 213 if (reuse == MAT_INITIAL_MATRIX) { 214 ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 215 } else if (reuse == MAT_REUSE_MATRIX) { 216 ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 217 } 218 A = *newmat; 219 220 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 221 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 222 223 a = (Mat_MPIAIJ*)A->data; 224 if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 225 a->spptr = new Mat_MPIAIJCUSPARSE; 226 227 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 228 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 229 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 230 cusparseStruct->stream = 0; 231 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 232 } 233 234 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 235 A->ops->mult = MatMult_MPIAIJCUSPARSE; 236 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 237 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 238 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 239 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 240 241 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 242 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 243 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 244 PetscFunctionReturn(0); 245 } 246 247 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 248 { 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 253 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 254 ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 255 PetscFunctionReturn(0); 256 } 257 258 /*@ 259 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 260 (the default parallel PETSc format). This matrix will ultimately pushed down 261 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 262 assembly performance the user should preallocate the matrix storage by setting 263 the parameter nz (or the array nnz). By setting these parameters accurately, 264 performance during matrix assembly can be increased by more than a factor of 50. 265 266 Collective 267 268 Input Parameters: 269 + comm - MPI communicator, set to PETSC_COMM_SELF 270 . m - number of rows 271 . n - number of columns 272 . nz - number of nonzeros per row (same for all rows) 273 - nnz - array containing the number of nonzeros in the various rows 274 (possibly different for each row) or NULL 275 276 Output Parameter: 277 . A - the matrix 278 279 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 280 MatXXXXSetPreallocation() paradigm instead of this routine directly. 281 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 282 283 Notes: 284 If nnz is given then nz is ignored 285 286 The AIJ format (also called the Yale sparse matrix format or 287 compressed row storage), is fully compatible with standard Fortran 77 288 storage. That is, the stored row and column indices can begin at 289 either one (as in Fortran) or zero. See the users' manual for details. 290 291 Specify the preallocated storage with either nz or nnz (not both). 292 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 293 allocation. For large problems you MUST preallocate memory or you 294 will get TERRIBLE performance, see the users' manual chapter on matrices. 295 296 By default, this format uses inodes (identical nodes) when possible, to 297 improve numerical efficiency of matrix-vector products and solves. We 298 search for consecutive rows with the same nonzero structure, thereby 299 reusing matrix information to achieve increased efficiency. 300 301 Level: intermediate 302 303 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 304 @*/ 305 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 306 { 307 PetscErrorCode ierr; 308 PetscMPIInt size; 309 310 PetscFunctionBegin; 311 ierr = MatCreate(comm,A);CHKERRQ(ierr); 312 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 313 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 314 if (size > 1) { 315 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 316 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 317 } else { 318 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 319 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 320 } 321 PetscFunctionReturn(0); 322 } 323 324 /*MC 325 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 326 327 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 328 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 329 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 330 331 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 332 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 333 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 334 for communicators controlling multiple processes. It is recommended that you call both of 335 the above preallocation routines for simplicity. 336 337 Options Database Keys: 338 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 339 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 340 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 341 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 342 343 Level: beginner 344 345 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 346 M 347 M*/ 348