/* Workaround macros for known PETSc/CUDA header interaction issues; must be
   defined before any PETSc header is included. */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

/* Preallocates the diagonal (b->A) and off-diagonal (b->B) blocks of an
   MPIAIJCUSPARSE matrix as MATSEQAIJCUSPARSE matrices, then propagates the
   cuSPARSE storage formats, handle, and stream from the MPI-level struct to
   both sequential blocks. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Validate user-supplied per-row nonzero counts before any allocation */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    /* Note the off-diagonal block is sized with the full global column count N */
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Push the GPU configuration down into both sequential blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* yy = A*xx. Overlaps the scatter of off-process entries of xx into a->lvec
   with the diagonal-block multiply, then adds the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* Diagonal block uses the local part of xx directly; scatter completes after */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zeroes all matrix entries. When a device-side "split CSR" mirror exists
   (spptr->deviceMat), its diagonal and off-diagonal value arrays are zeroed
   directly on the GPU so host and device copies stay consistent. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr);
      /* d_mat->diag.a / d_mat->offdiag.a live in device memory: copy the value
         POINTERS back to the host, then memset the pointed-to arrays on the GPU */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* zz = yy + A*xx, with the same communication/computation overlap as
   MatMult_MPIAIJCUSPARSE. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
PetscFunctionReturn(0); 116 } 117 118 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 119 { 120 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 121 PetscErrorCode ierr; 122 PetscInt nt; 123 124 PetscFunctionBegin; 125 ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 126 if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 127 ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 128 ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 129 ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 130 ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 131 PetscFunctionReturn(0); 132 } 133 134 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 135 { 136 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 137 Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 138 139 PetscFunctionBegin; 140 switch (op) { 141 case MAT_CUSPARSE_MULT_DIAG: 142 cusparseStruct->diagGPUMatFormat = format; 143 break; 144 case MAT_CUSPARSE_MULT_OFFDIAG: 145 cusparseStruct->offdiagGPUMatFormat = format; 146 break; 147 case MAT_CUSPARSE_ALL: 148 cusparseStruct->diagGPUMatFormat = format; 149 cusparseStruct->offdiagGPUMatFormat = format; 150 break; 151 default: 152 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. 
Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 153 } 154 PetscFunctionReturn(0); 155 } 156 157 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 158 { 159 MatCUSPARSEStorageFormat format; 160 PetscErrorCode ierr; 161 PetscBool flg; 162 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 163 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 164 165 PetscFunctionBegin; 166 ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 167 if (A->factortype==MAT_FACTOR_NONE) { 168 ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 169 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 170 if (flg) { 171 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 172 } 173 ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 174 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 175 if (flg) { 176 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 177 } 178 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 179 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 180 if (flg) { 181 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 182 } 183 } 184 ierr = PetscOptionsTail();CHKERRQ(ierr); 185 PetscFunctionReturn(0); 186 } 187 188 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 189 { 190 PetscErrorCode 
                             ierr;
  Mat_MPIAIJ                 *mpiaij         = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat          = cusparseStruct->deviceMat;
  PetscInt                   nnz_state       = A->nonzerostate;
  PetscFunctionBegin;
  if (d_mat) {
    /* The device may have advanced the nonzero state during device-side assembly */
    cudaError_t err;
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* lvec is created by MatAssemblyEnd_MPIAIJ on first final assembly; make it a CUDA vector */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

/* Destroys an MPIAIJCUSPARSE matrix: frees the device-side split-CSR mirror
   (only the arrays this code cudaMalloc'ed itself -- row offsets are owned
   here only in the compressed-row case, see MatCUSPARSEGetDeviceMatWrite),
   tears down the cuSPARSE handle/stream, and chains to MatDestroy_MPIAIJ. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* Pull the struct of device pointers back to the host so they can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      /* diag.i was cudaMalloc'ed in MatCUSPARSEGetDeviceMatWrite only in this case */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* Remove the type-specific methods composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Converts a MATMPIAIJ matrix to MATMPIAIJCUSPARSE: switches the sequential
   blocks and lvec to their CUDA types, creates the Mat_MPIAIJCUSPARSE struct
   (with a fresh cuSPARSE handle) when needed, and installs the CUSPARSE
   operation table and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* For MAT_INPLACE_MATRIX, *newmat is B itself */
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat           = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           =
                           MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor for MATMPIAIJCUSPARSE: builds a plain MPIAIJ matrix and
   converts it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row of the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row of the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* On one process there is no off-diagonal block; fall back to the sequential type */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.
   -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
//
// On first call this builds a PetscSplitCSRDataStructure in device memory that
// aliases the CSR arrays already held by cuSPARSE (extra copies are made only
// for compressed rows and the colmap); subsequent calls return the cached
// pointer. As a side effect the matrix is marked unassembled so device-side
// MatSetValues-style writes are legal.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj; /* int (not PetscInt): cuSPARSE CSR index type -- assumes 32-bit indices; TODO confirm for 64-bit-index builds */
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* Sequential matrix: A->spptr is the SeqAIJCUSPARSE struct directly; no off-diagonal block */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
      }
    } else {
      /* MPI matrix: grab device CSR pointers from both sequential blocks */
      Mat_MPIAIJ         *aij   = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
        /* Debugging dump of the off-diagonal block; dead unless rank is forced to -1 */
        if (rank==-1) {
          for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
            std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
          for(unsigned int i = 0; i < matrixB->column_indices->size(); i++)
            std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
          for(unsigned int i = 0; i < matrixB->values->size(); i++)
            std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
        }
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
      }
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* Factored matrices have no device mirror */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat; /* h_mat: host staging copy of the struct that will live at d_mat */
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      /* Sentinel in the extra slot guards against overwrite by the loop below */
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        /* colmap maps global column -> local off-diagonal column + 1 (0 == not present) */
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        /* Compressed-row case: the device CSR row offsets differ from the host's
           full jacb->i, so make a private device copy (freed in MatDestroy) */
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else {
        /* Otherwise alias the row offsets already on the device */
        h_mat.offdiag.i = bi;
      }
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    /* Already built: hand back the cached device pointer */
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}