#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
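
/*
   Usage sketch (illustrative, not part of this file): how the routine above is
   typically reached through the public API. The communicator, sizes, and the
   per-row nnz arrays are placeholders.

     Mat      A;
     PetscInt d_nnz[] = {3,2,3}, o_nnz[] = {1,0,1};  // per-local-row counts for the diagonal and off-diagonal blocks

     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,3,3,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,0,d_nnz,0,o_nnz);CHKERRQ(ierr);  // dispatches to MatMPIAIJSetPreallocation_MPIAIJCUSPARSE
*/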
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* overlap the scatter of the off-process entries of xx with the diagonal-block multiply */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device value-array pointers, then zero the value arrays in place */
      err = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals,0,(nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(&vals,&d_mat->offdiag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals,0,(nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  /* apply both transposed blocks locally, then accumulate off-process contributions with a reverse scatter */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
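
/*
   Usage sketch (illustrative, not part of this file): selecting storage formats
   programmatically through the composed MatCUSPARSESetFormat() interface, which
   dispatches to the routine above for MPIAIJCUSPARSE matrices.

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);    // diagonal block in ELL
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr); // off-diagonal block in HYB
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);CHKERRQ(ierr);          // or both at once
*/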
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                   nnz_state = A->nonzerostate;

  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t err;
    err = cudaMemcpy(&nnz_state,&d_mat->nonzerostate,sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the row-offset arrays were copied to the device only when compressed row storage is in use */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat           = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
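
/*
   Usage sketch (illustrative, not part of this file): promoting an existing host
   MPIAIJ matrix to this GPU type; B is assumed to be an assembled MATMPIAIJ matrix.

     Mat Agpu;
     ierr = MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&Agpu);CHKERRQ(ierr);
     // or convert in place:
     ierr = MatConvert(B,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
*/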
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row of the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row of the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the per-row counts (d_nz, o_nz) or
   the per-row arrays (d_nnz, o_nnz), not both. Set d_nz=o_nz=PETSC_DEFAULT and
   d_nnz=o_nnz=NULL for PETSc to control dynamic memory allocation. For large
   problems you MUST preallocate memory or you will get TERRIBLE performance, see
   the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
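
/*
   Usage sketch (illustrative, not part of this file): creating a distributed GPU
   matrix directly; the sizes and per-row estimates below are placeholders.

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
     // fill with MatSetValues(), then:
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/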
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or HYB (hybrid) format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
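
/*
   Usage sketch (illustrative; "app" is a hypothetical executable): selecting this
   type and its storage formats at run time via the options database keys above.

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
     ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format ell \
           -mat_cusparse_mult_offdiag_storage_format csr
*/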
// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
      }
    } else {
      Mat_MPIAIJ         *aij   = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
        if (rank==-1) { // debugging output, never taken in normal runs
          for (unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
            std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->column_indices->size(); i++)
            std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->values->size(); i++)
            std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
        }
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
      }
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void**)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset(d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
#if defined(PETSC_USE_CTABLE)
      SETERRQ(comm,PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; // sentinel to verify below that the map is not overrun
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
#endif
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void**)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else {
        h_mat.offdiag.i = bi;
      }
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void**)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
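      /* Note: colmap maps a global column index to 1 + its local position in the
         off-diagonal block (0 meaning "not present"), mirroring what MatSetValues()
         builds lazily on the host; the device copy presumably lets device-side
         assembly kernels translate global column indices the same way. */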
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void**)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done (lazily) in normal MatSetValues
  PetscFunctionReturn(0);
}
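
/*
   Usage sketch (illustrative, not part of this file): obtaining the device view
   for GPU-side assembly. my_assembly_kernel is a hypothetical user CUDA kernel
   that fills values through the PetscSplitCSRDataStructure it receives.

     PetscSplitCSRDataStructure *d_mat;
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);  // A must already be assembled once
     // d_mat points to device memory; dereference it only in device code:
     //   my_assembly_kernel<<<nblocks,nthreads>>>(d_mat, ...);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);  // finish as usual on the host
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/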