1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_CXX_COMPLEX_FIX 3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 4 5 #include <petscconf.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 9 10 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 11 { 12 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 14 PetscErrorCode ierr; 15 PetscInt i; 16 17 PetscFunctionBegin; 18 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 19 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 20 if (d_nnz) { 21 for (i=0; i<B->rmap->n; i++) { 22 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 23 } 24 } 25 if (o_nnz) { 26 for (i=0; i<B->rmap->n; i++) { 27 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 28 } 29 } 30 if (!B->preallocated) { 31 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 32 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 33 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 34 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 35 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 36 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 37 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 38 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 40 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 42 } 43 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 44 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 45 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 46 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 47 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 48 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 51 52 B->preallocated = PETSC_TRUE; 53 PetscFunctionReturn(0); 54 } 55 56 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 57 { 58 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 59 PetscErrorCode ierr; 60 PetscInt nt; 61 62 PetscFunctionBegin; 63 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 64 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 65 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 66 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 67 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 68 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 69 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 70 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 71 PetscFunctionReturn(0); 72 } 73 74 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 75 { 76 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 77 PetscErrorCode ierr; 78 PetscInt nt; 79 80 PetscFunctionBegin; 81 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 82 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 83 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 84 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 85 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 86 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 87 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 88 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 89 PetscFunctionReturn(0); 90 } 91 92 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 93 { 94 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 95 PetscErrorCode ierr; 96 PetscInt nt; 97 98 PetscFunctionBegin; 99 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 100 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 101 ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr); 102 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 103 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 104 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 105 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 106 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 107 PetscFunctionReturn(0); 108 } 109 110 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 111 { 112 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 113 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 114 115 PetscFunctionBegin; 116 switch (op) { 117 case MAT_CUSPARSE_MULT_DIAG: 118 cusparseStruct->diagGPUMatFormat = format; 119 break; 120 case MAT_CUSPARSE_MULT_OFFDIAG: 121 cusparseStruct->offdiagGPUMatFormat = format; 122 break; 123 case MAT_CUSPARSE_ALL: 124 cusparseStruct->diagGPUMatFormat = format; 125 cusparseStruct->offdiagGPUMatFormat = format; 126 break; 127 default: 128 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 129 } 130 PetscFunctionReturn(0); 131 } 132 133 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 134 { 135 MatCUSPARSEStorageFormat format; 136 PetscErrorCode ierr; 137 PetscBool flg; 138 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 139 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 140 141 PetscFunctionBegin; 142 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 143 if (A->factortype==MAT_FACTOR_NONE) { 144 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 145 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 146 if (flg) { 147 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 148 } 149 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 150 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 151 if (flg) { 152 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 153 } 154 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 155 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 156 if (flg) { 157 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 158 } 159 } 160 ierr = PetscOptionsTail();CHKERRQ(ierr); 161 PetscFunctionReturn(0); 162 } 163 164 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 165 { 166 PetscErrorCode ierr; 167 Mat_MPIAIJ *mpiaij; 168 169 PetscFunctionBegin; 170 mpiaij = (Mat_MPIAIJ*)A->data; 171 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 172 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 173 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 174 } 175 PetscFunctionReturn(0); 176 } 177 178 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 179 { 180 PetscErrorCode ierr; 181 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 182 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 183 cudaError_t err; 184 cusparseStatus_t stat; 185 186 PetscFunctionBegin; 187 try { 188 if (a->A) { ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); } 189 if (a->B) { ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); } 190 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 191 if (cusparseStruct->stream) { 192 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 193 } 194 delete cusparseStruct; 195 } catch(char *ex) { 196 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 197 } 198 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 199 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 200 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 201 PetscFunctionReturn(0); 202 } 203 204 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 205 { 206 PetscErrorCode ierr; 207 Mat_MPIAIJ *a; 208 Mat_MPIAIJCUSPARSE *cusparseStruct; 209 cusparseStatus_t stat; 210 Mat A; 211 212 PetscFunctionBegin; 213 if (reuse == MAT_INITIAL_MATRIX) { 214 ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 215 } else if (reuse == MAT_REUSE_MATRIX) { 216 ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 217 } 218 A = *newmat; 219 220 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 221 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 222 223 a = (Mat_MPIAIJ*)A->data; 224 if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 225 a->spptr = new Mat_MPIAIJCUSPARSE; 226 227 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 228 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 229 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 230 cusparseStruct->stream = 0; 231 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 232 } 233 234 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 235 A->ops->mult = MatMult_MPIAIJCUSPARSE; 236 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 237 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 238 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 239 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 240 241 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 242 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 243 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 244 PetscFunctionReturn(0); 245 } 246 247 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 248 { 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 253 ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 254 PetscFunctionReturn(0); 255 } 256 257 /*@ 258 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 259 (the default parallel PETSc format). This matrix will ultimately pushed down 260 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 261 assembly performance the user should preallocate the matrix storage by setting 262 the parameter nz (or the array nnz). By setting these parameters accurately, 263 performance during matrix assembly can be increased by more than a factor of 50. 264 265 Collective 266 267 Input Parameters: 268 + comm - MPI communicator, set to PETSC_COMM_SELF 269 . m - number of rows 270 . n - number of columns 271 . nz - number of nonzeros per row (same for all rows) 272 - nnz - array containing the number of nonzeros in the various rows 273 (possibly different for each row) or NULL 274 275 Output Parameter: 276 . A - the matrix 277 278 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 279 MatXXXXSetPreallocation() paradigm instead of this routine directly. 280 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 281 282 Notes: 283 If nnz is given then nz is ignored 284 285 The AIJ format (also called the Yale sparse matrix format or 286 compressed row storage), is fully compatible with standard Fortran 77 287 storage. That is, the stored row and column indices can begin at 288 either one (as in Fortran) or zero. See the users' manual for details. 289 290 Specify the preallocated storage with either nz or nnz (not both). 291 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 292 allocation. For large problems you MUST preallocate memory or you 293 will get TERRIBLE performance, see the users' manual chapter on matrices. 294 295 By default, this format uses inodes (identical nodes) when possible, to 296 improve numerical efficiency of matrix-vector products and solves. We 297 search for consecutive rows with the same nonzero structure, thereby 298 reusing matrix information to achieve increased efficiency. 299 300 Level: intermediate 301 302 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 303 @*/ 304 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 305 { 306 PetscErrorCode ierr; 307 PetscMPIInt size; 308 309 PetscFunctionBegin; 310 ierr = MatCreate(comm,A);CHKERRQ(ierr); 311 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 312 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 313 if (size > 1) { 314 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 315 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 316 } else { 317 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 318 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 319 } 320 PetscFunctionReturn(0); 321 } 322 323 /*MC 324 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 325 326 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 327 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 328 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 329 330 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 331 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 332 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 333 for communicators controlling multiple processes. It is recommended that you call both of 334 the above preallocation routines for simplicity. 335 336 Options Database Keys: 337 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 338 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 339 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 340 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 341 342 Level: beginner 343 344 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 345 M 346 M*/ 347