#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create the two MATSEQAIJCUSPARSE blocks: b->A holds the diagonal
       portion of the local submatrix, b->B the off-diagonal portion. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
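/* Illustrative sketch (not part of the original source): typical user-level setup
   that reaches MatMPIAIJSetPreallocation_MPIAIJCUSPARSE() above through the composed
   "MatMPIAIJSetPreallocation_C" entry point. The sizes and nonzero counts are made
   up for the example.

     Mat      A;
     PetscInt m = 4;                  // local rows on this rank (assumed)
     PetscInt d_nnz[4] = {3,3,3,3};   // nonzeros per local row, diagonal block
     PetscInt o_nnz[4] = {1,1,1,1};   // nonzeros per local row, off-diagonal block

     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,m,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,0,d_nnz,0,o_nnz);CHKERRQ(ierr);
*/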
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In particular,
     the diagonal-block multiplication kernel is launched in one stream. Then,
     in a separate stream, the DeviceToHost data transfers (with MPI messaging
     in between) and the HostToDevice transfers are launched. Once the data
     transfer stream is synchronized, to ensure the messaging is complete, the
     MatMultAdd kernel for the off-diagonal block is launched in the original
     (MatMult) stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* Transpose analogue of the sequence above: the transpose kernels for the
     off-diagonal and diagonal blocks are launched first, then the reverse
     scatter accumulates the off-process contributions from a->lvec into yy.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
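/* Illustrative sketch (not part of the original source): the two routines above are
   reached through the ordinary MatMult()/MatMultTranspose() interface once the matrix
   type is MATMPIAIJCUSPARSE; vectors obtained from the matrix inherit the CUDA vector
   type set in MatCreate_MPIAIJCUSPARSE() below, so the kernels stay on the GPU. The
   vector names are made up for the example.

     Vec x,y;
     ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr);   // CUDA vecs via A->defaultvectype
     ierr = MatMult(A,x,y);CHKERRQ(ierr);           // y = A*x, dispatches to MatMult_MPIAIJCUSPARSE
     ierr = MatMultTranspose(A,y,x);CHKERRQ(ierr);  // x = A'*y
*/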
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
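/* Illustrative sketch (not part of the original source): the options registered above
   are consumed during MatSetFromOptions(), so the storage formats can be chosen at run
   time. The program name is made up for the example.

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb

   or, with different diagonal and off-diagonal formats,

     ./app -mat_type mpiaijcusparse \
           -mat_cusparse_mult_diag_storage_format csr \
           -mat_cusparse_mult_offdiag_storage_format ell
*/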
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;  /* the struct was just deleted; do not leave a dangling pointer behind */

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic
   memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single-process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single-process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
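/* Illustrative end-to-end sketch (not part of the original source): assembling and
   using a matrix created with the convenience routine documented above. The global
   size, the 1-D Laplacian stencil, and the preallocation counts are made up for the
   example.

     Mat      A;
     Vec      x,b;
     PetscInt i,Istart,Iend;

     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,3,NULL,1,NULL,&A);CHKERRQ(ierr);
     ierr = MatGetOwnershipRange(A,&Istart,&Iend);CHKERRQ(ierr);
     for (i=Istart; i<Iend; i++) {   // tridiagonal stencil, values set on the host
       if (i > 0)  { ierr = MatSetValue(A,i,i-1,-1.0,INSERT_VALUES);CHKERRQ(ierr); }
       if (i < 99) { ierr = MatSetValue(A,i,i+1,-1.0,INSERT_VALUES);CHKERRQ(ierr); }
       ierr = MatSetValue(A,i,i,2.0,INSERT_VALUES);CHKERRQ(ierr);
     }
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);   // triggers MatAssemblyEnd_MPIAIJCUSPARSE
     ierr = MatCreateVecs(A,&x,&b);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = MatMult(A,x,b);CHKERRQ(ierr);                          // SpMV on the GPU via CUSPARSE
     ierr = VecDestroy(&x);CHKERRQ(ierr);
     ierr = VecDestroy(&b);CHKERRQ(ierr);
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/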