1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_CXX_COMPLEX_FIX 3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 4 5 #include <petscconf.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 8 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 9 10 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 11 { 12 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 14 PetscErrorCode ierr; 15 PetscInt i; 16 17 PetscFunctionBegin; 18 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 19 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 20 if (d_nnz) { 21 for (i=0; i<B->rmap->n; i++) { 22 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 23 } 24 } 25 if (o_nnz) { 26 for (i=0; i<B->rmap->n; i++) { 27 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 28 } 29 } 30 if (!B->preallocated) { 31 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 32 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 33 ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 34 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 35 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 36 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 37 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 38 ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 39 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 40 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 41 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 42 } 43 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 44 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 45 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 46 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 47 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 48 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 49 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 51 52 B->preallocated = PETSC_TRUE; 53 PetscFunctionReturn(0); 54 } 55 56 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 57 { 58 /* 59 This multiplication sequence is different sequence 60 than the CPU version. In particular, the diagonal block 61 multiplication kernel is launched in one stream. Then, 62 in a separate stream, the data transfers from DeviceToHost 63 (with MPI messaging in between), then HostToDevice are 64 launched. Once the data transfer stream is synchronized, 65 to ensure messaging is complete, the MatMultAdd kernel 66 is launched in the original (MatMult) stream to protect 67 against race conditions. 68 */ 69 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 70 PetscErrorCode ierr; 71 PetscInt nt; 72 73 PetscFunctionBegin; 74 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 75 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 76 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 77 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 78 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 79 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 80 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 81 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 82 PetscFunctionReturn(0); 83 } 84 85 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 86 { 87 /* 88 This multiplication sequence is different sequence 89 than the CPU version. In particular, the diagonal block 90 multiplication kernel is launched in one stream. Then, 91 in a separate stream, the data transfers from DeviceToHost 92 (with MPI messaging in between), then HostToDevice are 93 launched. Once the data transfer stream is synchronized, 94 to ensure messaging is complete, the MatMultAdd kernel 95 is launched in the original (MatMult) stream to protect 96 against race conditions. 97 */ 98 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 99 PetscErrorCode ierr; 100 PetscInt nt; 101 102 PetscFunctionBegin; 103 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 104 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 105 ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 106 ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 107 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 108 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 109 ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 110 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 111 PetscFunctionReturn(0); 112 } 113 114 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 115 { 116 /* This multiplication sequence is different sequence 117 than the CPU version. In particular, the diagonal block 118 multiplication kernel is launched in one stream. Then, 119 in a separate stream, the data transfers from DeviceToHost 120 (with MPI messaging in between), then HostToDevice are 121 launched. Once the data transfer stream is synchronized, 122 to ensure messaging is complete, the MatMultAdd kernel 123 is launched in the original (MatMult) stream to protect 124 against race conditions. 125 126 This sequence should only be called for GPU computation. */ 127 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 128 PetscErrorCode ierr; 129 PetscInt nt; 130 131 PetscFunctionBegin; 132 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 133 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 134 ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr); 135 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 136 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 137 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 138 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 139 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 140 PetscFunctionReturn(0); 141 } 142 143 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 144 { 145 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 146 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 147 148 PetscFunctionBegin; 149 switch (op) { 150 case MAT_CUSPARSE_MULT_DIAG: 151 cusparseStruct->diagGPUMatFormat = format; 152 break; 153 case MAT_CUSPARSE_MULT_OFFDIAG: 154 cusparseStruct->offdiagGPUMatFormat = format; 155 break; 156 case MAT_CUSPARSE_ALL: 157 cusparseStruct->diagGPUMatFormat = format; 158 cusparseStruct->offdiagGPUMatFormat = format; 159 break; 160 default: 161 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 162 } 163 PetscFunctionReturn(0); 164 } 165 166 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 167 { 168 MatCUSPARSEStorageFormat format; 169 PetscErrorCode ierr; 170 PetscBool flg; 171 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 172 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 173 174 PetscFunctionBegin; 175 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 176 if (A->factortype==MAT_FACTOR_NONE) { 177 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 178 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 179 if (flg) { 180 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 181 } 182 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 183 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 184 if (flg) { 185 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 186 } 187 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 188 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 189 if (flg) { 190 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 191 } 192 } 193 ierr = PetscOptionsTail();CHKERRQ(ierr); 194 PetscFunctionReturn(0); 195 } 196 197 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 198 { 199 PetscErrorCode ierr; 200 Mat_MPIAIJ *mpiaij; 201 202 PetscFunctionBegin; 203 mpiaij = (Mat_MPIAIJ*)A->data; 204 ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 205 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 206 ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 207 } 208 PetscFunctionReturn(0); 209 } 210 211 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 212 { 213 PetscErrorCode ierr; 214 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 215 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 216 cudaError_t err; 217 cusparseStatus_t stat; 218 219 PetscFunctionBegin; 220 try { 221 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 222 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 223 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 224 if (cusparseStruct->stream) { 225 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 226 } 227 delete cusparseStruct; 228 } catch(char *ex) { 229 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 230 } 231 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 232 PetscFunctionReturn(0); 233 } 234 235 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 236 { 237 PetscErrorCode ierr; 238 Mat_MPIAIJ *a; 239 Mat_MPIAIJCUSPARSE * cusparseStruct; 240 cusparseStatus_t stat; 241 242 PetscFunctionBegin; 243 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 244 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 245 ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 246 ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 247 248 a = (Mat_MPIAIJ*)A->data; 249 a->spptr = new Mat_MPIAIJCUSPARSE; 250 251 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 252 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 253 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 254 cusparseStruct->stream = 0; 255 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 256 257 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 258 A->ops->mult = MatMult_MPIAIJCUSPARSE; 259 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 260 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 261 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 262 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 263 264 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 265 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 266 PetscFunctionReturn(0); 267 } 268 269 /*@ 270 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 271 (the default parallel PETSc format). This matrix will ultimately pushed down 272 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 273 assembly performance the user should preallocate the matrix storage by setting 274 the parameter nz (or the array nnz). By setting these parameters accurately, 275 performance during matrix assembly can be increased by more than a factor of 50. 276 277 Collective 278 279 Input Parameters: 280 + comm - MPI communicator, set to PETSC_COMM_SELF 281 . m - number of rows 282 . n - number of columns 283 . nz - number of nonzeros per row (same for all rows) 284 - nnz - array containing the number of nonzeros in the various rows 285 (possibly different for each row) or NULL 286 287 Output Parameter: 288 . A - the matrix 289 290 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 291 MatXXXXSetPreallocation() paradigm instead of this routine directly. 292 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 293 294 Notes: 295 If nnz is given then nz is ignored 296 297 The AIJ format (also called the Yale sparse matrix format or 298 compressed row storage), is fully compatible with standard Fortran 77 299 storage. That is, the stored row and column indices can begin at 300 either one (as in Fortran) or zero. See the users' manual for details. 301 302 Specify the preallocated storage with either nz or nnz (not both). 303 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 304 allocation. For large problems you MUST preallocate memory or you 305 will get TERRIBLE performance, see the users' manual chapter on matrices. 306 307 By default, this format uses inodes (identical nodes) when possible, to 308 improve numerical efficiency of matrix-vector products and solves. We 309 search for consecutive rows with the same nonzero structure, thereby 310 reusing matrix information to achieve increased efficiency. 311 312 Level: intermediate 313 314 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 315 @*/ 316 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 317 { 318 PetscErrorCode ierr; 319 PetscMPIInt size; 320 321 PetscFunctionBegin; 322 ierr = MatCreate(comm,A);CHKERRQ(ierr); 323 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 324 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 325 if (size > 1) { 326 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 327 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 328 } else { 329 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 330 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 331 } 332 PetscFunctionReturn(0); 333 } 334 335 /*MC 336 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 337 338 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 339 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 340 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 341 342 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 343 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 344 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 345 for communicators controlling multiple processes. It is recommended that you call both of 346 the above preallocation routines for simplicity. 347 348 Options Database Keys: 349 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 350 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 351 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 352 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 353 354 Level: beginner 355 356 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 357 M 358 M*/ 359