#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
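/*
   Typical user-level route into the routine above: create the matrix, select
   the CUSPARSE type, then preallocate. A minimal sketch under assumed
   conditions (square system; row lengths precomputed by the application into
   d_nnz[] and o_nnz[]; all names besides the PETSc calls are illustrative):

     Mat      A;
     PetscInt nlocal = 100;              // assumed local rows = local cols
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,nlocal,nlocal,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,0,d_nnz,0,o_nnz);CHKERRQ(ierr);
*/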
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence differs from the CPU version. In
     particular, the diagonal block multiplication kernel is launched in one
     stream. Then, in a separate stream, the DeviceToHost data transfers
     (with MPI messaging in between) and the subsequent HostToDevice
     transfers are launched. Once the data transfer stream is synchronized,
     ensuring the messaging is complete, the MatMultAdd kernel for the
     off-diagonal block is launched in the original (MatMult) stream to
     protect against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /* See MatMult_MPIAIJCUSPARSE() above for a description of the stream and
     scatter sequence; the same pattern applies here, accumulating into zz. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
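/*
   The stream overlap described in MatMult_MPIAIJCUSPARSE(), in schematic
   form (an illustrative sketch only; the actual kernels and copies live
   inside the SeqAIJCUSPARSE operations and the VecScatter):

     compute stream:   y_local  = A_diag * x_local       // launches immediately
     transfer stream:  DtoH copy of the needed x entries
                       MPI exchange of ghost values
                       HtoD copy into the ghost vector lvec
     <synchronize transfer stream>                        // ghosts now complete
     compute stream:   y_local += A_offdiag * lvec        // off-diagonal SpMV-add
*/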
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* See MatMult_MPIAIJCUSPARSE() for a description of the stream sequence;
     here the off-diagonal transpose product is computed first and the
     scatter runs in reverse, adding the ghost contributions into yy.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
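/*
   How the format options above are typically exercised, either
   programmatically before assembly or from the options database. A minimal
   sketch (A is assumed to already be a MATMPIAIJCUSPARSE matrix):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);

   or, equivalently, on the command line:

     -mat_cusparse_mult_diag_storage_format ell -mat_cusparse_mult_offdiag_storage_format hyb
*/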
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr   = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
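/*
   Note on dispatch: MatCreate_MPIAIJCUSPARSE() above composes
   "MatMPIAIJSetPreallocation_C" onto the object, so a generic call to
   MatMPIAIJSetPreallocation() reaches the CUSPARSE-specific routine roughly
   like this (a simplified sketch of PETSc's query mechanism, not a copy of
   the code in mpiaij.c):

     PetscErrorCode (*f)(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[]);
     ierr = PetscObjectQueryFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",&f);CHKERRQ(ierr);
     if (f) { ierr = (*f)(B,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); }
*/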
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same value is used for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT, d_nnz=NULL, and o_nnz=NULL for PETSc to
   control dynamic memory allocation. For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
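/*
   Example of calling the convenience constructor above directly (a minimal
   sketch; the global size of 100 and the per-row counts of 5 diagonal and 2
   off-diagonal nonzeros are illustrative numbers, not recommendations):

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
     // ... fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/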
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
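/*
   A hypothetical invocation pulling these options together (the executable
   name "./ex1" is illustrative; any PETSc program that calls
   MatSetFromOptions() and VecSetFromOptions() will do):

     mpiexec -n 2 ./ex1 -mat_type mpiaijcusparse -vec_type cuda -mat_cusparse_storage_format hyb
*/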