#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
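/* A minimal usage sketch (illustrative only, not compiled into the library):
   creating an MPIAIJCUSPARSE matrix and preallocating its diagonal and
   off-diagonal blocks, which ends up in the routine above.  The global size
   and per-row nonzero counts are assumptions for the example. */
#if 0
static PetscErrorCode ExampleCreateAndPreallocate(MPI_Comm comm,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  /* 5 nonzeros per row in the diagonal block, 2 in the off-diagonal block */
  ierr = MatMPIAIJSetPreallocation(*A,5,NULL,2,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif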
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err  = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,nnza*sizeof(PetscScalar));CHKERRCUDA(err);
      err  = cudaMemcpy(&vals,&d_mat->offdiag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,nnzb*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
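/* A brief sketch (illustrative only, not compiled): selecting cuSPARSE storage
   formats per block through the public MatCUSPARSESetFormat() interface, which
   dispatches to MatCUSPARSESetFormat_MPIAIJCUSPARSE() above. */
#if 0
static PetscErrorCode ExampleSetFormats(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* ELL for the diagonal block, HYB for the off-diagonal block */
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
  /* or one format for both blocks */
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif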
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij         = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat          = cusparseStruct->deviceMat;
  PetscInt                   nnz_state       = A->nonzerostate;

  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t err;
    err = cudaMemcpy(&nnz_state,&d_mat->nonzerostate,sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
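/* A short sketch (illustrative only, not compiled): the options handled above
   can also be injected programmatically, equivalent to passing
   -mat_cusparse_storage_format ell on the command line. */
#if 0
static PetscErrorCode ExampleSetFormatViaOptions(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsSetValue(NULL,"-mat_cusparse_storage_format","ell");CHKERRQ(ierr);
  ierr = MatSetFromOptions(A);CHKERRQ(ierr); /* routes through MatSetFromOptions_MPIAIJCUSPARSE() */
  PetscFunctionReturn(0);
}
#endif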
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
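/* A short sketch (illustrative only, not compiled): converting an existing
   host MPIAIJ matrix through the public MatConvert() interface, which
   dispatches to MatConvert_MPIAIJ_MPIAIJCUSPARSE() above. */
#if 0
static PetscErrorCode ExampleConvertToCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* in-place conversion; MAT_INITIAL_MATRIX with a separate output Mat also works */
  ierr = MatConvert(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif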
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise o_nz and o_nnz) for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
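/* A minimal sketch (illustrative only, not compiled): direct creation with
   preallocation on a multi-process communicator.  The global size and per-row
   nonzero estimates are assumptions for the example. */
#if 0
static PetscErrorCode ExampleCreateAIJCUSPARSE(Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* 1000 global rows/columns, ~7 nonzeros/row in the diagonal block, ~3 off-diagonal */
  ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,1000,1000,7,NULL,3,NULL,A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif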
/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
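/* A usage sketch (illustrative only, not compiled) matching the options keys
   above: the matrix type and storage formats are chosen at runtime, e.g. with
   -mat_type mpiaijcusparse -mat_cusparse_storage_format ell. */
#if 0
static PetscErrorCode ExampleCreateFromOptions(MPI_Comm comm,PetscInt M,PetscInt N,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,PETSC_DECIDE,PETSC_DECIDE,M,N);CHKERRQ(ierr);
  ierr = MatSetFromOptions(*A);CHKERRQ(ierr); /* picks up -mat_type and -mat_cusparse_* keys */
  ierr = MatSetUp(*A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif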
// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij   = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err  = cudaMalloc((void**)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err  = cudaMemset(d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void**)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void**)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void**)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err  = cudaMemcpy(d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done (lazily) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
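/* A device-side sketch (illustrative only, not compiled): consuming the
   pointer returned by MatCUSPARSEGetDeviceMatWrite() in a user CUDA kernel.
   The field accesses (diag.i row offsets, diag.a values, diag.n local rows)
   follow the uses above; the kernel itself is a hypothetical example that
   scales the local diagonal-block values.  Note the routine leaves
   A->assembled = PETSC_FALSE, so the caller is expected to reassemble after
   writing. */
#if 0
__global__ void ScaleDiagBlock(PetscSplitCSRDataStructure *d_mat, PetscScalar alpha)
{
  int row = blockIdx.x*blockDim.x + threadIdx.x;
  if (row < d_mat->diag.n) {
    for (int k = d_mat->diag.i[row]; k < d_mat->diag.i[row+1]; k++) d_mat->diag.a[k] *= alpha;
  }
}

static PetscErrorCode ExampleScaleOnDevice(Mat A, PetscScalar alpha)
{
  PetscSplitCSRDataStructure *d_mat;
  PetscInt                   n = A->rmap->n;
  PetscErrorCode             ierr;
  cudaError_t                err;

  PetscFunctionBegin;
  ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);
  ScaleDiagBlock<<<(n+255)/256,256>>>(d_mat,alpha);
  err  = cudaDeviceSynchronize();CHKERRCUDA(err);
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif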