1 #define PETSC_SKIP_SPINLOCK 2 3 #include <petscconf.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 5 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 6 7 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 8 { 9 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 10 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 11 PetscErrorCode ierr; 12 PetscInt i; 13 14 PetscFunctionBegin; 15 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 16 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 17 if (d_nnz) { 18 for (i=0; i<B->rmap->n; i++) { 19 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 20 } 21 } 22 if (o_nnz) { 23 for (i=0; i<B->rmap->n; i++) { 24 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 25 } 26 } 27 if (!B->preallocated) { 28 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 29 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 30 ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr); 31 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 32 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 33 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 34 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 35 ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr); 36 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 37 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 38 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 39 } 40 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 41 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 42 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 43 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 44 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 45 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 46 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 47 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 48 49 B->preallocated = PETSC_TRUE; 50 PetscFunctionReturn(0); 51 } 52 53 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 54 { 55 /* 56 This multiplication sequence is different sequence 57 than the CPU version. In particular, the diagonal block 58 multiplication kernel is launched in one stream. Then, 59 in a separate stream, the data transfers from DeviceToHost 60 (with MPI messaging in between), then HostToDevice are 61 launched. Once the data transfer stream is synchronized, 62 to ensure messaging is complete, the MatMultAdd kernel 63 is launched in the original (MatMult) stream to protect 64 against race conditions. 65 */ 66 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 67 PetscErrorCode ierr; 68 PetscInt nt; 69 70 PetscFunctionBegin; 71 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 72 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 73 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 74 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 75 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 76 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 77 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 78 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 79 PetscFunctionReturn(0); 80 } 81 82 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 83 { 84 /* 85 This multiplication sequence is different sequence 86 than the CPU version. In particular, the diagonal block 87 multiplication kernel is launched in one stream. Then, 88 in a separate stream, the data transfers from DeviceToHost 89 (with MPI messaging in between), then HostToDevice are 90 launched. Once the data transfer stream is synchronized, 91 to ensure messaging is complete, the MatMultAdd kernel 92 is launched in the original (MatMult) stream to protect 93 against race conditions. 94 */ 95 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 96 PetscErrorCode ierr; 97 PetscInt nt; 98 99 PetscFunctionBegin; 100 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 101 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 102 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 103 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 104 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 105 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 106 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 107 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 108 PetscFunctionReturn(0); 109 } 110 111 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 112 { 113 /* This multiplication sequence is different sequence 114 than the CPU version. In particular, the diagonal block 115 multiplication kernel is launched in one stream. Then, 116 in a separate stream, the data transfers from DeviceToHost 117 (with MPI messaging in between), then HostToDevice are 118 launched. Once the data transfer stream is synchronized, 119 to ensure messaging is complete, the MatMultAdd kernel 120 is launched in the original (MatMult) stream to protect 121 against race conditions. 122 123 This sequence should only be called for GPU computation. */ 124 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 125 PetscErrorCode ierr; 126 PetscInt nt; 127 128 PetscFunctionBegin; 129 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 130 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 131 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 132 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 133 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 134 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 135 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 136 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 137 PetscFunctionReturn(0); 138 } 139 140 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 141 { 142 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 143 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 144 145 PetscFunctionBegin; 146 switch (op) { 147 case MAT_CUSPARSE_MULT_DIAG: 148 cusparseStruct->diagGPUMatFormat = format; 149 break; 150 case MAT_CUSPARSE_MULT_OFFDIAG: 151 cusparseStruct->offdiagGPUMatFormat = format; 152 break; 153 case MAT_CUSPARSE_ALL: 154 cusparseStruct->diagGPUMatFormat = format; 155 cusparseStruct->offdiagGPUMatFormat = format; 156 break; 157 default: 158 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 159 } 160 PetscFunctionReturn(0); 161 } 162 163 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 164 { 165 MatCUSPARSEStorageFormat format; 166 PetscErrorCode ierr; 167 PetscBool flg; 168 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 169 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 170 171 PetscFunctionBegin; 172 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 173 if (A->factortype==MAT_FACTOR_NONE) { 174 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 175 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 176 if (flg) { 177 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 178 } 179 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 180 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 181 if (flg) { 182 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 183 } 184 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 185 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 186 if (flg) { 187 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 188 } 189 } 190 ierr = PetscOptionsTail();CHKERRQ(ierr); 191 PetscFunctionReturn(0); 192 } 193 194 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 195 { 196 PetscErrorCode ierr; 197 Mat_MPIAIJ *mpiaij; 198 199 PetscFunctionBegin; 200 mpiaij = (Mat_MPIAIJ*)A->data; 201 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 202 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 203 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 204 } 205 PetscFunctionReturn(0); 206 } 207 208 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 209 { 210 PetscErrorCode ierr; 211 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 212 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 213 cudaError_t err; 214 cusparseStatus_t stat; 215 216 PetscFunctionBegin; 217 try { 218 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 219 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 220 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat); 221 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 222 delete cusparseStruct; 223 } catch(char *ex) { 224 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 225 } 226 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 227 PetscFunctionReturn(0); 228 } 229 230 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 231 { 232 PetscErrorCode ierr; 233 Mat_MPIAIJ *a; 234 Mat_MPIAIJCUSPARSE * cusparseStruct; 235 cudaError_t err; 236 cusparseStatus_t stat; 237 238 PetscFunctionBegin; 239 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 240 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 241 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 242 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 243 244 a = (Mat_MPIAIJ*)A->data; 245 a->spptr = new Mat_MPIAIJCUSPARSE; 246 247 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 248 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 249 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 250 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat); 251 err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err); 252 253 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 254 A->ops->mult = MatMult_MPIAIJCUSPARSE; 255 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 256 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 257 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 258 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 259 260 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 261 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 262 PetscFunctionReturn(0); 263 } 264 265 /*@ 266 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 267 (the default parallel PETSc format). This matrix will ultimately pushed down 268 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 269 assembly performance the user should preallocate the matrix storage by setting 270 the parameter nz (or the array nnz). By setting these parameters accurately, 271 performance during matrix assembly can be increased by more than a factor of 50. 272 273 Collective 274 275 Input Parameters: 276 + comm - MPI communicator, set to PETSC_COMM_SELF 277 . m - number of rows 278 . n - number of columns 279 . nz - number of nonzeros per row (same for all rows) 280 - nnz - array containing the number of nonzeros in the various rows 281 (possibly different for each row) or NULL 282 283 Output Parameter: 284 . A - the matrix 285 286 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 287 MatXXXXSetPreallocation() paradigm instead of this routine directly. 288 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 289 290 Notes: 291 If nnz is given then nz is ignored 292 293 The AIJ format (also called the Yale sparse matrix format or 294 compressed row storage), is fully compatible with standard Fortran 77 295 storage. That is, the stored row and column indices can begin at 296 either one (as in Fortran) or zero. See the users' manual for details. 297 298 Specify the preallocated storage with either nz or nnz (not both). 299 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 300 allocation. For large problems you MUST preallocate memory or you 301 will get TERRIBLE performance, see the users' manual chapter on matrices. 302 303 By default, this format uses inodes (identical nodes) when possible, to 304 improve numerical efficiency of matrix-vector products and solves. We 305 search for consecutive rows with the same nonzero structure, thereby 306 reusing matrix information to achieve increased efficiency. 307 308 Level: intermediate 309 310 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 311 @*/ 312 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 313 { 314 PetscErrorCode ierr; 315 PetscMPIInt size; 316 317 PetscFunctionBegin; 318 ierr = MatCreate(comm,A);CHKERRQ(ierr); 319 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 320 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 321 if (size > 1) { 322 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 323 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 324 } else { 325 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 326 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 327 } 328 PetscFunctionReturn(0); 329 } 330 331 /*MC 332 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 333 334 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 335 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 336 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 337 338 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 339 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 340 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 341 for communicators controlling multiple processes. It is recommended that you call both of 342 the above preallocation routines for simplicity. 343 344 Options Database Keys: 345 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 346 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 347 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 348 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 349 350 Level: beginner 351 352 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 353 M 354 M*/ 355