#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
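/*
   A minimal usage sketch (illustrative only; excluded from the build, and the
   function name ExamplePreallocate is hypothetical): per-row preallocation with
   explicit d_nnz/o_nnz arrays, the counts validated row by row above. The sizes
   and per-row bounds are made up for illustration.
*/
#if 0
static PetscErrorCode ExamplePreallocate(MPI_Comm comm,Mat *A)
{
  PetscErrorCode ierr;
  PetscInt       i,nlocal = 4,d_nnz[4],o_nnz[4];

  PetscFunctionBegin;
  for (i=0; i<nlocal; i++) {d_nnz[i] = 3; o_nnz[i] = 2;} /* per-row nonzero upper bounds */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,nlocal,nlocal,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(*A,0,d_nnz,0,o_nnz);CHKERRQ(ierr); /* d_nz/o_nz ignored when arrays are given */
  PetscFunctionReturn(0);
}
#endif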
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the CPU version. In
     particular, the diagonal block multiplication kernel is launched in
     one stream. Then, in a separate stream, the DeviceToHost data
     transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data transfer stream is
     synchronized, to ensure the messaging is complete, the MatMultAdd
     kernel is launched in the original (MatMult) stream to protect
     against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     Same streaming sequence as MatMult_MPIAIJCUSPARSE() above, with the
     diagonal-block MatMultAdd kernel accumulating into zz.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
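/*
   Sketch (illustrative only; excluded from the build, and the function name
   ExampleMult is hypothetical): the routines above implement yy = A_d*xx + B_o*lvec,
   where A_d is the local diagonal block, B_o the off-diagonal block, and lvec the
   scattered ghost values. Calling MatMult() on an assembled MPIAIJCUSPARSE matrix
   dispatches to this pipeline; MatCreateVecs() yields CUDA vectors because
   MatCreate_MPIAIJCUSPARSE() below sets the default vector type to VECCUDA.
*/
#if 0
static PetscErrorCode ExampleMult(Mat A)
{
  PetscErrorCode ierr;
  Vec            x,y;

  PetscFunctionBegin;
  ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr); /* VECCUDA vectors for aijcusparse matrices */
  ierr = VecSet(x,1.0);CHKERRQ(ierr);
  ierr = MatMult(A,x,y);CHKERRQ(ierr);         /* dispatches to MatMult_MPIAIJCUSPARSE */
  ierr = VecDestroy(&x);CHKERRQ(ierr);
  ierr = VecDestroy(&y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif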
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     Streamed analogue of the CPU sequence, run in reverse: the off-diagonal
     and diagonal transpose kernels are launched first, then the reverse
     scatter (DeviceToHost transfer, MPI messaging, HostToDevice transfer)
     accumulates the off-process contributions into yy.

     This sequence should only be called for GPU computation.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
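/*
   Sketch (illustrative only; excluded from the build, and the function name
   ExampleSetFormats is hypothetical): selecting the storage formats
   programmatically instead of through the options database handled above.
   MAT_CUSPARSE_MULT_DIAG and MAT_CUSPARSE_MULT_OFFDIAG address the diagonal and
   off-diagonal blocks separately; MAT_CUSPARSE_ALL sets both at once.
*/
#if 0
static PetscErrorCode ExampleSetFormats(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif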
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
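/*
   Sketch (illustrative only; excluded from the build, and the function name
   ExampleCreateFromOptions is hypothetical): the MatCreate()/MatSetType()/
   MatSetFromOptions() paradigm recommended in the man page below, which lets
   -mat_type and the -mat_cusparse_* options take effect at runtime before
   preallocation. The sizes and bounds are made up for illustration.
*/
#if 0
static PetscErrorCode ExampleCreateFromOptions(MPI_Comm comm,PetscInt nlocal,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,nlocal,nlocal,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetFromOptions(*A);CHKERRQ(ierr); /* honors -mat_type and -mat_cusparse_* */
  ierr = MatMPIAIJSetPreallocation(*A,5,NULL,2,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif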
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the scalar counts or the arrays (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
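/*
   Sketch (illustrative only; excluded from the build, and the function name
   ExampleCreateDirect is hypothetical): direct use of MatCreateAIJCUSPARSE()
   above with scalar preallocation bounds; d_nz/o_nz bound the nonzeros per row
   in the diagonal and off-diagonal blocks. The global size 128 and the bounds
   3 and 2 are made up for illustration.
*/
#if 0
static PetscErrorCode ExampleCreateDirect(MPI_Comm comm,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateAIJCUSPARSE(comm,PETSC_DECIDE,PETSC_DECIDE,128,128,3,NULL,2,NULL,A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif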
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
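/*
   Example command lines exercising the options documented above (the executable
   name ./app is hypothetical):

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
     ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format ell \
           -mat_cusparse_mult_offdiag_storage_format csr
*/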