1 #define PETSC_SKIP_COMPLEX 2 3 #include <petscconf.h> 4 PETSC_CUDA_EXTERN_C_BEGIN 5 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 6 PETSC_CUDA_EXTERN_C_END 7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 8 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 12 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 13 { 14 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 15 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 16 PetscErrorCode ierr; 17 PetscInt i; 18 19 PetscFunctionBegin; 20 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 21 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 22 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 23 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 24 25 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 26 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 27 if (d_nnz) { 28 for (i=0; i<B->rmap->n; i++) { 29 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 30 } 31 } 32 if (o_nnz) { 33 for (i=0; i<B->rmap->n; i++) { 34 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 35 } 36 } 37 if (!B->preallocated) { 38 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 39 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 40 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 41 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 42 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 43 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 44 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 45 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 46 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 47 } 48 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 49 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 51 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 52 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 53 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 54 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 55 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 56 57 B->preallocated = PETSC_TRUE; 58 PetscFunctionReturn(0); 59 } 60 61 #undef __FUNCT__ 62 #define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE" 63 PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 64 { 65 PetscErrorCode ierr; 66 PetscInt rbs,cbs; 67 68 PetscFunctionBegin; 69 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 70 if (right) { 71 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 72 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 73 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 74 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 75 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 76 } 77 if (left) { 78 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 79 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 80 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 81 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 82 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 83 84 85 } 86 PetscFunctionReturn(0); 87 } 88 89 90 #undef __FUNCT__ 91 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 92 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 93 { 94 /* This multiplication sequence is different sequence 95 than the CPU version. In particular, the diagonal block 96 multiplication kernel is launched in one stream. Then, 97 in a separate stream, the data transfers from DeviceToHost 98 (with MPI messaging in between), then HostToDevice are 99 launched. Once the data transfer stream is synchronized, 100 to ensure messaging is complete, the MatMultAdd kernel 101 is launched in the original (MatMult) stream to protect 102 against race conditions. 103 104 This sequence should only be called for GPU computation. */ 105 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 106 PetscErrorCode ierr; 107 PetscInt nt; 108 109 PetscFunctionBegin; 110 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 111 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 112 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 113 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 114 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 115 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 116 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 117 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 118 PetscFunctionReturn(0); 119 } 120 121 #undef __FUNCT__ 122 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 123 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 124 { 125 /* This multiplication sequence is different sequence 126 than the CPU version. In particular, the diagonal block 127 multiplication kernel is launched in one stream. Then, 128 in a separate stream, the data transfers from DeviceToHost 129 (with MPI messaging in between), then HostToDevice are 130 launched. Once the data transfer stream is synchronized, 131 to ensure messaging is complete, the MatMultAdd kernel 132 is launched in the original (MatMult) stream to protect 133 against race conditions. 134 135 This sequence should only be called for GPU computation. */ 136 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 137 PetscErrorCode ierr; 138 PetscInt nt; 139 140 PetscFunctionBegin; 141 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 142 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 143 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 144 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 145 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 146 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 147 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 148 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 149 PetscFunctionReturn(0); 150 } 151 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 154 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 155 { 156 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 157 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 158 159 PetscFunctionBegin; 160 switch (op) { 161 case MAT_CUSPARSE_MULT_DIAG: 162 cusparseStruct->diagGPUMatFormat = format; 163 break; 164 case MAT_CUSPARSE_MULT_OFFDIAG: 165 cusparseStruct->offdiagGPUMatFormat = format; 166 break; 167 case MAT_CUSPARSE_ALL: 168 cusparseStruct->diagGPUMatFormat = format; 169 cusparseStruct->offdiagGPUMatFormat = format; 170 break; 171 default: 172 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 173 } 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 179 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A) 180 { 181 MatCUSPARSEStorageFormat format; 182 PetscErrorCode ierr; 183 PetscBool flg; 184 185 PetscFunctionBegin; 186 ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr); 187 ierr = PetscObjectOptionsBegin((PetscObject)A); 188 if (A->factortype==MAT_FACTOR_NONE) { 189 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 190 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 191 if (flg) { 192 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 193 } 194 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 195 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 196 if (flg) { 197 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 198 } 199 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 200 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 201 if (flg) { 202 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 203 } 204 } 205 ierr = PetscOptionsEnd();CHKERRQ(ierr); 206 PetscFunctionReturn(0); 207 } 208 209 #undef __FUNCT__ 210 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 211 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 212 { 213 PetscErrorCode ierr; 214 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 215 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 216 cudaError_t err; 217 cusparseStatus_t stat; 218 219 PetscFunctionBegin; 220 try { 221 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 222 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 223 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat); 224 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err); 225 delete cusparseStruct; 226 } catch(char *ex) { 227 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 228 } 229 cusparseStruct = 0; 230 231 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 232 PetscFunctionReturn(0); 233 } 234 235 #undef __FUNCT__ 236 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 237 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 238 { 239 PetscErrorCode ierr; 240 Mat_MPIAIJ *a; 241 Mat_MPIAIJCUSPARSE * cusparseStruct; 242 cudaError_t err; 243 cusparseStatus_t stat; 244 245 PetscFunctionBegin; 246 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 247 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 248 a = (Mat_MPIAIJ*)A->data; 249 a->spptr = new Mat_MPIAIJCUSPARSE; 250 251 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 252 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 253 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 254 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat); 255 err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err); 256 257 A->ops->getvecs = MatCreateVecs_MPIAIJCUSPARSE; 258 A->ops->mult = MatMult_MPIAIJCUSPARSE; 259 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 260 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 261 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 262 263 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 264 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 265 PetscFunctionReturn(0); 266 } 267 268 /*@ 269 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 270 (the default parallel PETSc format). This matrix will ultimately pushed down 271 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 272 assembly performance the user should preallocate the matrix storage by setting 273 the parameter nz (or the array nnz). By setting these parameters accurately, 274 performance during matrix assembly can be increased by more than a factor of 50. 275 276 Collective on MPI_Comm 277 278 Input Parameters: 279 + comm - MPI communicator, set to PETSC_COMM_SELF 280 . m - number of rows 281 . n - number of columns 282 . nz - number of nonzeros per row (same for all rows) 283 - nnz - array containing the number of nonzeros in the various rows 284 (possibly different for each row) or NULL 285 286 Output Parameter: 287 . A - the matrix 288 289 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 290 MatXXXXSetPreallocation() paradigm instead of this routine directly. 291 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 292 293 Notes: 294 If nnz is given then nz is ignored 295 296 The AIJ format (also called the Yale sparse matrix format or 297 compressed row storage), is fully compatible with standard Fortran 77 298 storage. That is, the stored row and column indices can begin at 299 either one (as in Fortran) or zero. See the users' manual for details. 300 301 Specify the preallocated storage with either nz or nnz (not both). 302 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 303 allocation. For large problems you MUST preallocate memory or you 304 will get TERRIBLE performance, see the users' manual chapter on matrices. 305 306 By default, this format uses inodes (identical nodes) when possible, to 307 improve numerical efficiency of matrix-vector products and solves. We 308 search for consecutive rows with the same nonzero structure, thereby 309 reusing matrix information to achieve increased efficiency. 310 311 Level: intermediate 312 313 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 314 @*/ 315 #undef __FUNCT__ 316 #define __FUNCT__ "MatCreateAIJCUSPARSE" 317 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 318 { 319 PetscErrorCode ierr; 320 PetscMPIInt size; 321 322 PetscFunctionBegin; 323 ierr = MatCreate(comm,A);CHKERRQ(ierr); 324 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 325 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 326 if (size > 1) { 327 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 328 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 329 } else { 330 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 331 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 332 } 333 PetscFunctionReturn(0); 334 } 335 336 /*M 337 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 338 339 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 340 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 341 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 342 343 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 344 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 345 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 346 for communicators controlling multiple processes. It is recommended that you call both of 347 the above preallocation routines for simplicity. 348 349 Options Database Keys: 350 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 351 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 352 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 353 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 354 355 Level: beginner 356 357 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 358 M 359 M*/ 360