#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

/*
   Preallocation for MPIAIJCUSPARSE: validates d_nnz/o_nnz, then creates the two
   sequential blocks (diagonal b->A and off-diagonal b->B) as MATSEQAIJCUSPARSE
   and propagates the stored GPU formats, cuSPARSE handle, and stream to both.
*/
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    /* b->B holds the off-diagonal part; its column space is the full global column space */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* both sub-matrices share the handle/stream stored on the MPI wrapper */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*
   yy = A*xx. Overlaps the off-process scatter of xx into a->lvec with the
   local diagonal-block multiply, then adds the off-diagonal contribution.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Zeros all entries. If a stripped-down device mirror exists, its diagonal and
   off-diagonal value arrays are zeroed directly on the GPU before the host-side
   sub-matrices are zeroed.
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device-side value pointers, then memset the value arrays in place */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/*
   zz = yy + A*xx. Same scatter/compute overlap as MatMult_MPIAIJCUSPARSE.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   yy = A^T*xx. Both local transposes run first; their off-process
   contributions are then accumulated via a reverse scatter.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Records the requested GPU storage format for the diagonal and/or
   off-diagonal blocks; applied to the sub-matrices at preallocation time.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of MAT_CUSPARSE_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*
   Options processing: -mat_cusparse_mult_diag_storage_format,
   -mat_cusparse_mult_offdiag_storage_format, and -mat_cusparse_storage_format
   (the last sets both blocks). Only meaningful for unfactored matrices.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Assembly end: delegates to MatAssemblyEnd_MPIAIJ, switches lvec to CUDA on
   first final assembly, and — if a device mirror exists whose nonzerostate is
   ahead of the host's — marks the matrix data as resident on the GPU.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij         = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat          = cusparseStruct->deviceMat;
  PetscInt                   nnz_state       = A->nonzerostate;
  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t err;
    /* read the nonzerostate recorded on the device mirror */
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

/*
   Destroy: frees the device mirror (only the arrays this file cudaMalloc'ed —
   non-compressed row pointers alias the sub-matrices' own device storage),
   clears the cuSPARSE handles/stream, then falls through to MatDestroy_MPIAIJ.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* row offsets were deep-copied only in the compressedrow case */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Converts an MPIAIJ matrix into MPIAIJCUSPARSE: installs CUDA as the default
   vector type, allocates the Mat_MPIAIJCUSPARSE context (with a fresh cuSPARSE
   handle) when needed, and rewires the type-specific operations.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat           = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Constructor registered for -mat_type mpiaijcusparse: build MPIAIJ, then convert in place */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or d_nnz) and o_nz (or o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the diagonal block of the various
           local rows (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the off-diagonal block of the
           various local rows (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz-style or nnz-style arguments
   (not both). Set the nz values to PETSC_DEFAULT and the nnz arrays to NULL for
   PETSc to control dynamic memory allocation. For large problems you MUST
   preallocate memory or you will get TERRIBLE performance, see the users'
   manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    /* single-rank communicator: fall back to the sequential CUSPARSE type */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only a diagonal block; off-diagonal pointers stay NULL */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
      }
    } else {
      Mat_MPIAIJ         *aij   = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
        if (rank==-1) { /* debug dump of the off-diagonal CSR; disabled (rank is never -1) */
          for (unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
            std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->column_indices->size(); i++)
            std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->values->size(); i++)
            std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
        }
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
      }
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      /* sentinel at the end of colmap guards against out-of-range garray entries */
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else {
        h_mat.offdiag.i = bi; /* alias the existing device row offsets */
      }
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai; /* alias the existing device row offsets */
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}