1 #define PETSC_SKIP_SPINLOCK 2 3 #include <petscconf.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 6 7 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 8 { 9 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 10 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 11 PetscErrorCode ierr; 12 PetscInt i; 13 14 PetscFunctionBegin; 15 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 16 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 17 if (d_nnz) { 18 for (i=0; i<B->rmap->n; i++) { 19 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 20 } 21 } 22 if (o_nnz) { 23 for (i=0; i<B->rmap->n; i++) { 24 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 25 } 26 } 27 if (!B->preallocated) { 28 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 29 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 30 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 31 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 32 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 33 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 34 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 35 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 36 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 37 } 38 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 39 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 40 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 41 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 42 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 43 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 44 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 45 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 46 47 B->preallocated = PETSC_TRUE; 48 PetscFunctionReturn(0); 49 } 50 51 PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 52 { 53 PetscErrorCode ierr; 54 PetscInt rbs,cbs; 55 56 PetscFunctionBegin; 57 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 58 if (right) { 59 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 60 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 61 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 62 ierr = VecSetType(*right,VECCUDA);CHKERRQ(ierr); 63 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 64 } 65 if (left) { 66 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 67 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 68 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 69 ierr = VecSetType(*left,VECCUDA);CHKERRQ(ierr); 70 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 71 72 73 } 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 78 { 79 /* This multiplication sequence is different sequence 80 than the CPU version. In particular, the diagonal block 81 multiplication kernel is launched in one stream. Then, 82 in a separate stream, the data transfers from DeviceToHost 83 (with MPI messaging in between), then HostToDevice are 84 launched. Once the data transfer stream is synchronized, 85 to ensure messaging is complete, the MatMultAdd kernel 86 is launched in the original (MatMult) stream to protect 87 against race conditions. 88 89 This sequence should only be called for GPU computation. */ 90 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 91 PetscErrorCode ierr; 92 PetscInt nt; 93 94 PetscFunctionBegin; 95 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 96 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 97 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 98 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 99 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 100 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 101 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 102 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 103 PetscFunctionReturn(0); 104 } 105 106 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 107 { 108 /* This multiplication sequence is different sequence 109 than the CPU version. In particular, the diagonal block 110 multiplication kernel is launched in one stream. Then, 111 in a separate stream, the data transfers from DeviceToHost 112 (with MPI messaging in between), then HostToDevice are 113 launched. Once the data transfer stream is synchronized, 114 to ensure messaging is complete, the MatMultAdd kernel 115 is launched in the original (MatMult) stream to protect 116 against race conditions. 117 118 This sequence should only be called for GPU computation. */ 119 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 120 PetscErrorCode ierr; 121 PetscInt nt; 122 123 PetscFunctionBegin; 124 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 125 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 126 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr); 127 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 128 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 129 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 130 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 131 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 132 PetscFunctionReturn(0); 133 } 134 135 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 136 { 137 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 138 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 139 140 PetscFunctionBegin; 141 switch (op) { 142 case MAT_CUSPARSE_MULT_DIAG: 143 cusparseStruct->diagGPUMatFormat = format; 144 break; 145 case MAT_CUSPARSE_MULT_OFFDIAG: 146 cusparseStruct->offdiagGPUMatFormat = format; 147 break; 148 case MAT_CUSPARSE_ALL: 149 cusparseStruct->diagGPUMatFormat = format; 150 cusparseStruct->offdiagGPUMatFormat = format; 151 break; 152 default: 153 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 154 } 155 PetscFunctionReturn(0); 156 } 157 158 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 159 { 160 MatCUSPARSEStorageFormat format; 161 PetscErrorCode ierr; 162 PetscBool flg; 163 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 164 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 165 166 PetscFunctionBegin; 167 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 168 ierr = PetscObjectOptionsBegin((PetscObject)A); 169 if (A->factortype==MAT_FACTOR_NONE) { 170 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 171 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 172 if (flg) { 173 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 174 } 175 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 176 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 177 if (flg) { 178 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 179 } 180 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 181 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 182 if (flg) { 183 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 184 } 185 } 186 ierr = PetscOptionsEnd();CHKERRQ(ierr); 187 PetscFunctionReturn(0); 188 } 189 190 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 191 { 192 PetscErrorCode ierr; 193 Mat_MPIAIJ *mpiaij; 194 195 PetscFunctionBegin; 196 mpiaij = (Mat_MPIAIJ*)A->data; 197 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 198 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 199 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 200 } 201 PetscFunctionReturn(0); 202 } 203 204 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 205 { 206 PetscErrorCode ierr; 207 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 208 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 209 cudaError_t err; 210 cusparseStatus_t stat; 211 212 PetscFunctionBegin; 213 try { 214 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 215 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 216 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat); 217 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 218 delete cusparseStruct; 219 } catch(char *ex) { 220 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 221 } 222 cusparseStruct = 0; 223 224 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 229 { 230 PetscErrorCode ierr; 231 Mat_MPIAIJ *a; 232 Mat_MPIAIJCUSPARSE * cusparseStruct; 233 cudaError_t err; 234 cusparseStatus_t stat; 235 236 PetscFunctionBegin; 237 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 238 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 239 a = (Mat_MPIAIJ*)A->data; 240 a->spptr = new Mat_MPIAIJCUSPARSE; 241 242 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 243 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 244 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 245 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat); 246 err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err); 247 248 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 249 A->ops->getvecs = MatCreateVecs_MPIAIJCUSPARSE; 250 A->ops->mult = MatMult_MPIAIJCUSPARSE; 251 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 252 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 253 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 254 255 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 256 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 257 PetscFunctionReturn(0); 258 } 259 260 /*@ 261 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 262 (the default parallel PETSc format). This matrix will ultimately pushed down 263 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 264 assembly performance the user should preallocate the matrix storage by setting 265 the parameter nz (or the array nnz). By setting these parameters accurately, 266 performance during matrix assembly can be increased by more than a factor of 50. 267 268 Collective on MPI_Comm 269 270 Input Parameters: 271 + comm - MPI communicator, set to PETSC_COMM_SELF 272 . m - number of rows 273 . n - number of columns 274 . nz - number of nonzeros per row (same for all rows) 275 - nnz - array containing the number of nonzeros in the various rows 276 (possibly different for each row) or NULL 277 278 Output Parameter: 279 . A - the matrix 280 281 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 282 MatXXXXSetPreallocation() paradigm instead of this routine directly. 283 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 284 285 Notes: 286 If nnz is given then nz is ignored 287 288 The AIJ format (also called the Yale sparse matrix format or 289 compressed row storage), is fully compatible with standard Fortran 77 290 storage. That is, the stored row and column indices can begin at 291 either one (as in Fortran) or zero. See the users' manual for details. 292 293 Specify the preallocated storage with either nz or nnz (not both). 294 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 295 allocation. For large problems you MUST preallocate memory or you 296 will get TERRIBLE performance, see the users' manual chapter on matrices. 297 298 By default, this format uses inodes (identical nodes) when possible, to 299 improve numerical efficiency of matrix-vector products and solves. We 300 search for consecutive rows with the same nonzero structure, thereby 301 reusing matrix information to achieve increased efficiency. 302 303 Level: intermediate 304 305 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 306 @*/ 307 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 308 { 309 PetscErrorCode ierr; 310 PetscMPIInt size; 311 312 PetscFunctionBegin; 313 ierr = MatCreate(comm,A);CHKERRQ(ierr); 314 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 315 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 316 if (size > 1) { 317 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 318 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 319 } else { 320 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 321 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 322 } 323 PetscFunctionReturn(0); 324 } 325 326 /*MC 327 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 328 329 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 330 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 331 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 332 333 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 334 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 335 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 336 for communicators controlling multiple processes. It is recommended that you call both of 337 the above preallocation routines for simplicity. 338 339 Options Database Keys: 340 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 341 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 342 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 343 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 344 345 Level: beginner 346 347 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 348 M 349 M*/ 350