1 #define PETSC_SKIP_SPINLOCK 2 3 #include <petscconf.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 6 7 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 8 { 9 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 10 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 11 PetscErrorCode ierr; 12 PetscInt i; 13 14 PetscFunctionBegin; 15 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 16 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 17 if (d_nnz) { 18 for (i=0; i<B->rmap->n; i++) { 19 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 20 } 21 } 22 if (o_nnz) { 23 for (i=0; i<B->rmap->n; i++) { 24 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 25 } 26 } 27 if (!B->preallocated) { 28 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 29 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 30 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 31 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 32 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 33 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 34 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 35 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 36 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 37 } 38 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 39 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 40 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 41 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 42 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 43 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 44 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 45 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 46 47 B->preallocated = PETSC_TRUE; 48 PetscFunctionReturn(0); 49 } 50 51 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 52 { 53 /* This multiplication sequence is different sequence 54 than the CPU version. In particular, the diagonal block 55 multiplication kernel is launched in one stream. Then, 56 in a separate stream, the data transfers from DeviceToHost 57 (with MPI messaging in between), then HostToDevice are 58 launched. Once the data transfer stream is synchronized, 59 to ensure messaging is complete, the MatMultAdd kernel 60 is launched in the original (MatMult) stream to protect 61 against race conditions. 62 63 This sequence should only be called for GPU computation. */ 64 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt nt; 67 68 PetscFunctionBegin; 69 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 70 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 71 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 72 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 73 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 74 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 75 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 76 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 77 PetscFunctionReturn(0); 78 } 79 80 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 81 { 82 /* This multiplication sequence is different sequence 83 than the CPU version. In particular, the diagonal block 84 multiplication kernel is launched in one stream. Then, 85 in a separate stream, the data transfers from DeviceToHost 86 (with MPI messaging in between), then HostToDevice are 87 launched. Once the data transfer stream is synchronized, 88 to ensure messaging is complete, the MatMultAdd kernel 89 is launched in the original (MatMult) stream to protect 90 against race conditions. 91 92 This sequence should only be called for GPU computation. */ 93 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 94 PetscErrorCode ierr; 95 PetscInt nt; 96 97 PetscFunctionBegin; 98 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 99 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 100 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_REVERSE);CHKERRQ(ierr); 101 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 102 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 103 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 104 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 105 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 106 PetscFunctionReturn(0); 107 } 108 109 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 110 { 111 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 112 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 113 114 PetscFunctionBegin; 115 switch (op) { 116 case MAT_CUSPARSE_MULT_DIAG: 117 cusparseStruct->diagGPUMatFormat = format; 118 break; 119 case MAT_CUSPARSE_MULT_OFFDIAG: 120 cusparseStruct->offdiagGPUMatFormat = format; 121 break; 122 case MAT_CUSPARSE_ALL: 123 cusparseStruct->diagGPUMatFormat = format; 124 cusparseStruct->offdiagGPUMatFormat = format; 125 break; 126 default: 127 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 128 } 129 PetscFunctionReturn(0); 130 } 131 132 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 133 { 134 MatCUSPARSEStorageFormat format; 135 PetscErrorCode ierr; 136 PetscBool flg; 137 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 138 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 139 140 PetscFunctionBegin; 141 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 142 if (A->factortype==MAT_FACTOR_NONE) { 143 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 144 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 145 if (flg) { 146 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 147 } 148 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 149 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 150 if (flg) { 151 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 152 } 153 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 154 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 155 if (flg) { 156 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 157 } 158 } 159 ierr = PetscOptionsTail();CHKERRQ(ierr); 160 PetscFunctionReturn(0); 161 } 162 163 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 164 { 165 PetscErrorCode ierr; 166 Mat_MPIAIJ *mpiaij; 167 168 PetscFunctionBegin; 169 mpiaij = (Mat_MPIAIJ*)A->data; 170 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 171 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 172 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 173 } 174 PetscFunctionReturn(0); 175 } 176 177 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 178 { 179 PetscErrorCode ierr; 180 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 181 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 182 cudaError_t err; 183 cusparseStatus_t stat; 184 185 PetscFunctionBegin; 186 try { 187 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 188 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 189 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat); 190 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 191 delete cusparseStruct; 192 } catch(char *ex) { 193 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 194 } 195 cusparseStruct = 0; 196 197 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 198 PetscFunctionReturn(0); 199 } 200 201 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 202 { 203 PetscErrorCode ierr; 204 Mat_MPIAIJ *a; 205 Mat_MPIAIJCUSPARSE * cusparseStruct; 206 cudaError_t err; 207 cusparseStatus_t stat; 208 209 PetscFunctionBegin; 210 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 211 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 212 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 213 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 214 215 a = (Mat_MPIAIJ*)A->data; 216 a->spptr = new Mat_MPIAIJCUSPARSE; 217 218 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 219 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 220 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 221 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat); 222 err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err); 223 224 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 225 A->ops->mult = MatMult_MPIAIJCUSPARSE; 226 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 227 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 228 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 229 230 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 231 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 232 PetscFunctionReturn(0); 233 } 234 235 /*@ 236 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 237 (the default parallel PETSc format). This matrix will ultimately pushed down 238 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 239 assembly performance the user should preallocate the matrix storage by setting 240 the parameter nz (or the array nnz). By setting these parameters accurately, 241 performance during matrix assembly can be increased by more than a factor of 50. 242 243 Collective on MPI_Comm 244 245 Input Parameters: 246 + comm - MPI communicator, set to PETSC_COMM_SELF 247 . m - number of rows 248 . n - number of columns 249 . nz - number of nonzeros per row (same for all rows) 250 - nnz - array containing the number of nonzeros in the various rows 251 (possibly different for each row) or NULL 252 253 Output Parameter: 254 . A - the matrix 255 256 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 257 MatXXXXSetPreallocation() paradigm instead of this routine directly. 258 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 259 260 Notes: 261 If nnz is given then nz is ignored 262 263 The AIJ format (also called the Yale sparse matrix format or 264 compressed row storage), is fully compatible with standard Fortran 77 265 storage. That is, the stored row and column indices can begin at 266 either one (as in Fortran) or zero. See the users' manual for details. 267 268 Specify the preallocated storage with either nz or nnz (not both). 269 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 270 allocation. For large problems you MUST preallocate memory or you 271 will get TERRIBLE performance, see the users' manual chapter on matrices. 272 273 By default, this format uses inodes (identical nodes) when possible, to 274 improve numerical efficiency of matrix-vector products and solves. We 275 search for consecutive rows with the same nonzero structure, thereby 276 reusing matrix information to achieve increased efficiency. 277 278 Level: intermediate 279 280 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 281 @*/ 282 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 283 { 284 PetscErrorCode ierr; 285 PetscMPIInt size; 286 287 PetscFunctionBegin; 288 ierr = MatCreate(comm,A);CHKERRQ(ierr); 289 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 290 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 291 if (size > 1) { 292 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 293 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 294 } else { 295 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 296 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 297 } 298 PetscFunctionReturn(0); 299 } 300 301 /*MC 302 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 303 304 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 305 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 306 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 307 308 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 309 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 310 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 311 for communicators controlling multiple processes. It is recommended that you call both of 312 the above preallocation routines for simplicity. 313 314 Options Database Keys: 315 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 316 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 317 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 318 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 319 320 Level: beginner 321 322 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 323 M 324 M*/ 325