1 #define PETSC_SKIP_COMPLEX 2 3 #include <petscconf.h> 4 PETSC_CUDA_EXTERN_C_BEGIN 5 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 6 PETSC_CUDA_EXTERN_C_END 7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 8 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 12 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 13 { 14 Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 15 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 16 PetscErrorCode ierr; 17 PetscInt i; 18 19 PetscFunctionBegin; 20 if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 21 if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 22 if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 23 if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 24 25 ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 26 ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 27 if (d_nnz) { 28 for (i=0; i<B->rmap->n; i++) { 29 if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 30 } 31 } 32 if (o_nnz) { 33 for (i=0; i<B->rmap->n; i++) { 34 if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 35 } 36 } 37 if (!B->preallocated) { 38 /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 39 ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 40 ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 41 ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 42 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 43 ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 44 ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 45 ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 46 ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 47 } 48 ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 49 ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 50 ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 51 ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 52 ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 53 ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 54 ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 55 ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 56 57 B->preallocated = PETSC_TRUE; 58 PetscFunctionReturn(0); 59 } 60 61 #undef __FUNCT__ 62 #define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE" 63 PetscErrorCode MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 64 { 65 PetscErrorCode ierr; 66 PetscInt rbs,cbs; 67 68 PetscFunctionBegin; 69 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 70 if (right) { 71 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 72 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 73 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 74 ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 75 ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr); 76 } 77 if (left) { 78 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 79 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 80 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 81 ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 82 ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr); 83 84 85 } 86 PetscFunctionReturn(0); 87 } 88 89 90 #undef __FUNCT__ 91 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 92 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 93 { 94 /* This multiplication sequence is different sequence 95 than the CPU version. In particular, the diagonal block 96 multiplication kernel is launched in one stream. Then, 97 in a separate stream, the data transfers from DeviceToHost 98 (with MPI messaging in between), then HostToDevice are 99 launched. Once the data transfer stream is synchronized, 100 to ensure messaging is complete, the MatMultAdd kernel 101 is launched in the original (MatMult) stream to protect 102 against race conditions. 103 104 This sequence should only be called for GPU computation. */ 105 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 106 PetscErrorCode ierr; 107 PetscInt nt; 108 109 PetscFunctionBegin; 110 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 111 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 112 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 113 ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 114 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 115 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 116 ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 117 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 118 PetscFunctionReturn(0); 119 } 120 121 #undef __FUNCT__ 122 #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 123 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 124 { 125 /* This multiplication sequence is different sequence 126 than the CPU version. In particular, the diagonal block 127 multiplication kernel is launched in one stream. Then, 128 in a separate stream, the data transfers from DeviceToHost 129 (with MPI messaging in between), then HostToDevice are 130 launched. Once the data transfer stream is synchronized, 131 to ensure messaging is complete, the MatMultAdd kernel 132 is launched in the original (MatMult) stream to protect 133 against race conditions. 134 135 This sequence should only be called for GPU computation. */ 136 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 137 PetscErrorCode ierr; 138 PetscInt nt; 139 140 PetscFunctionBegin; 141 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 142 if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 143 ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 144 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 145 ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 146 ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 147 ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 148 ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 149 PetscFunctionReturn(0); 150 } 151 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 154 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 155 { 156 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 157 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 158 159 PetscFunctionBegin; 160 switch (op) { 161 case MAT_CUSPARSE_MULT_DIAG: 162 cusparseStruct->diagGPUMatFormat = format; 163 break; 164 case MAT_CUSPARSE_MULT_OFFDIAG: 165 cusparseStruct->offdiagGPUMatFormat = format; 166 break; 167 case MAT_CUSPARSE_ALL: 168 cusparseStruct->diagGPUMatFormat = format; 169 cusparseStruct->offdiagGPUMatFormat = format; 170 break; 171 default: 172 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 173 } 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 179 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A) 180 { 181 MatCUSPARSEStorageFormat format; 182 PetscErrorCode ierr; 183 PetscBool flg; 184 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 185 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 186 187 PetscFunctionBegin; 188 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 189 ierr = PetscObjectOptionsBegin((PetscObject)A); 190 if (A->factortype==MAT_FACTOR_NONE) { 191 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 192 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 193 if (flg) { 194 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 195 } 196 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 197 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 198 if (flg) { 199 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 200 } 201 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 202 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 203 if (flg) { 204 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 205 } 206 } 207 ierr = PetscOptionsEnd();CHKERRQ(ierr); 208 PetscFunctionReturn(0); 209 } 210 211 #undef __FUNCT__ 212 #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 213 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 214 { 215 PetscErrorCode ierr; 216 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 217 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 218 cudaError_t err; 219 cusparseStatus_t stat; 220 221 PetscFunctionBegin; 222 try { 223 ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr); 224 ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr); 225 stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat); 226 err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err); 227 delete cusparseStruct; 228 } catch(char *ex) { 229 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 230 } 231 cusparseStruct = 0; 232 233 ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 234 PetscFunctionReturn(0); 235 } 236 237 #undef __FUNCT__ 238 #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 239 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 240 { 241 PetscErrorCode ierr; 242 Mat_MPIAIJ *a; 243 Mat_MPIAIJCUSPARSE * cusparseStruct; 244 cudaError_t err; 245 cusparseStatus_t stat; 246 247 PetscFunctionBegin; 248 ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 249 ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 250 a = (Mat_MPIAIJ*)A->data; 251 a->spptr = new Mat_MPIAIJCUSPARSE; 252 253 cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 254 cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 255 cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 256 stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat); 257 err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err); 258 259 A->ops->getvecs = MatCreateVecs_MPIAIJCUSPARSE; 260 A->ops->mult = MatMult_MPIAIJCUSPARSE; 261 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 262 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 263 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 264 265 ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 266 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 267 PetscFunctionReturn(0); 268 } 269 270 /*@ 271 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 272 (the default parallel PETSc format). This matrix will ultimately pushed down 273 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 274 assembly performance the user should preallocate the matrix storage by setting 275 the parameter nz (or the array nnz). By setting these parameters accurately, 276 performance during matrix assembly can be increased by more than a factor of 50. 277 278 Collective on MPI_Comm 279 280 Input Parameters: 281 + comm - MPI communicator, set to PETSC_COMM_SELF 282 . m - number of rows 283 . n - number of columns 284 . nz - number of nonzeros per row (same for all rows) 285 - nnz - array containing the number of nonzeros in the various rows 286 (possibly different for each row) or NULL 287 288 Output Parameter: 289 . A - the matrix 290 291 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 292 MatXXXXSetPreallocation() paradigm instead of this routine directly. 293 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 294 295 Notes: 296 If nnz is given then nz is ignored 297 298 The AIJ format (also called the Yale sparse matrix format or 299 compressed row storage), is fully compatible with standard Fortran 77 300 storage. That is, the stored row and column indices can begin at 301 either one (as in Fortran) or zero. See the users' manual for details. 302 303 Specify the preallocated storage with either nz or nnz (not both). 304 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 305 allocation. For large problems you MUST preallocate memory or you 306 will get TERRIBLE performance, see the users' manual chapter on matrices. 307 308 By default, this format uses inodes (identical nodes) when possible, to 309 improve numerical efficiency of matrix-vector products and solves. We 310 search for consecutive rows with the same nonzero structure, thereby 311 reusing matrix information to achieve increased efficiency. 312 313 Level: intermediate 314 315 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 316 @*/ 317 #undef __FUNCT__ 318 #define __FUNCT__ "MatCreateAIJCUSPARSE" 319 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 320 { 321 PetscErrorCode ierr; 322 PetscMPIInt size; 323 324 PetscFunctionBegin; 325 ierr = MatCreate(comm,A);CHKERRQ(ierr); 326 ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 327 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 328 if (size > 1) { 329 ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 330 ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 331 } else { 332 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 333 ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 334 } 335 PetscFunctionReturn(0); 336 } 337 338 /*M 339 MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 340 341 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 342 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 343 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 344 345 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 346 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 347 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 348 for communicators controlling multiple processes. It is recommended that you call both of 349 the above preallocation routines for simplicity. 350 351 Options Database Keys: 352 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 353 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 354 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 355 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 356 357 Level: beginner 358 359 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 360 M 361 M*/ 362