/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Names parsed by PetscOptionsEnum(): value strings, then the enum-type name, prefix, and terminating 0 */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations for the factorization, solve, multiply, and destroy routines defined in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* note: two C++ overloads share this name — one for tri-factor structs, one for mult structs */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Record the given CUDA stream in A's cuSPARSE context and attach it to the cuSPARSE handle,
   so subsequent cuSPARSE calls on A execute in that stream. Errors if A has no spptr. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace A's cuSPARSE handle with the given one. If A already holds a different handle,
   that handle is destroyed first. The pointer mode of the (new) handle is set to DEVICE,
   i.e. scalar arguments of cuSPARSE calls are expected in GPU memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the previously owned handle before taking the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Null out the cuSPARSE handle reference held by A without destroying it.
   A no-op when A is not of type MATSEQAIJCUSPARSE or has no cuSPARSE context. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0; /* drop the reference only; the handle itself is not destroyed here */
  PetscFunctionReturn(0);
}

/* Callback composed on factored matrices: reports MATSOLVERCUSPARSE as the solver package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSOLVERCUSPARSE: creates the (empty) factor matrix B for A,
   wires the symbolic-factorization function pointers according to the factor type, and selects
   the CPU fallbacks when A is bound to the CPU. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate CPU binding from A when requested */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   storage format in A's cuSPARSE context. MULT and ALL currently set the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): stores the flag */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Output Parameter:

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSetOption override: handles MAT_FORM_EXPLICIT_TRANSPOSE specially (invalidating any
   cached transpose when turning the option off); everything else falls back to SeqAIJ. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: computed on the CPU via MatLUFactorNumeric_SeqAIJ, then the
   solve function pointers are chosen based on the orderings, and (unless the CPU solve
   was requested) the triangular factors are analyzed and copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* natural ordering on both sides: no permutation needed during the solves */
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* command line options: storage formats, CPU-solve flag, and
   (CUDA >= 11) the SpMV/SpMM/csr2csc algorithm choices, cross-checked against cuSPARSE's enums */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any existing GPU tri-factor data, run the SeqAIJ symbolic phase,
   and point the numeric phase at the CUSPARSE version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reset GPU tri-factor data, run the SeqAIJ symbolic phase, select the
   CUSPARSE Cholesky numeric phase */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as ICC above */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors
= (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or refresh) the unit-diagonal lower-triangular factor of an (I)LU factorization
   for the GPU triangular solves: the host CSR data of the factored matrix is repacked
   (inserting the implicit unit diagonal) into pinned host buffers, uploaded into thrust
   arrays, and the cuSPARSE triangular-solve analysis is performed once. On later calls
   only the numerical values are re-copied and re-uploaded. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers for the repacked factor (cudaMallocHost for fast upload) */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          /* append the unit diagonal entry at the end of the row */
          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the scratch buffer required by the csrsv2 analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* the pinned values buffer is kept (AA_h) for later value-only updates; indices are freed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh) the upper-triangular factor of an (I)LU factorization for the GPU
   triangular solves. The rows are traversed backwards using a->diag; the stored diagonal
   is replaced by its reciprocal (non-unit diagonal, scaled during the solve). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask ==
      PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffer for the repacked factor values */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking the rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements: store the reciprocal of the stored diagonal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the scratch buffer required by the csrsv2 analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned values buffer (AA_h) for later value-only updates; free the index buffers */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both (I)LU triangular factors on the GPU, allocate the solve work vector, and cache
   the row/column permutations on the device when the orderings are not the identity */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr =
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 733 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 734 735 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 736 cusparseTriFactors->nnz=a->nz; 737 738 A->offloadmask = PETSC_OFFLOAD_BOTH; 739 /* lower triangular indices */ 740 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 741 if (!row_identity && !cusparseTriFactors->rpermIndices) { 742 const PetscInt *r; 743 744 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 745 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 746 cusparseTriFactors->rpermIndices->assign(r, r+n); 747 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 748 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 749 } 750 751 /* upper triangular indices */ 752 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 753 if (!col_identity && !cusparseTriFactors->cpermIndices) { 754 const PetscInt *c; 755 756 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 757 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 758 cusparseTriFactors->cpermIndices->assign(c, c+n); 759 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 760 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 761 } 762 PetscFunctionReturn(0); 763 } 764 765 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 766 { 767 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 768 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 769 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 770 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 771 cusparseStatus_t stat; 772 PetscErrorCode ierr; 773 cudaError_t cerr; 774 PetscInt *AiUp, *AjUp; 775 PetscScalar *AAUp; 776 PetscScalar *AALo; 777 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 778 Mat_SeqSBAIJ *b = 
(Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (inverted: v[nz] is the diagonal of row i) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: AAUp gets the negated entries; AALo additionally
               scales by the diagonal, producing the values used for the
               transposed lower solve */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* NOTE: fill mode stays UPPER here because the lower factor is the
           transpose of the stored upper-pattern matrix (solveOp below) */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation: solve with U^T, i.e. the lower factor */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same sparsity pattern as the upper factor, AALo values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr =
cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist on the GPU: recompute only the numerical values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (inverted) */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheckFalse(!upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheckFalse(!loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the ICC triangular factors on the GPU, allocates the solve work
   vector, and (for non-identity orderings) uploads the permutation and its
   inverse for use by the device solves. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of L+U: the strictly-upper entries are stored in both factors, the
     diagonal only once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: permutation (rows) and its inverse (columns) */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: factor on the CPU, then pick the natural-ordering or
   permuted MatSolve variants and push the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
  */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Builds transposed (CSC) copies of both triangular factors on the GPU and
   runs the cusparse solve analysis on them; used by MatSolveTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips because transposing swaps upper and lower */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e.
the CSC */ 1100 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1101 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1102 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1103 loTriFactor->csrMat->values->data().get(), 1104 loTriFactor->csrMat->row_offsets->data().get(), 1105 loTriFactor->csrMat->column_indices->data().get(), 1106 loTriFactorT->csrMat->values->data().get(), 1107 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1108 CUSPARSE_ACTION_NUMERIC,indexBase, 1109 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1110 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1111 #endif 1112 1113 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1114 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1115 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1116 loTriFactor->csrMat->values->data().get(), 1117 loTriFactor->csrMat->row_offsets->data().get(), 1118 loTriFactor->csrMat->column_indices->data().get(), 1119 loTriFactorT->csrMat->values->data().get(), 1120 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1121 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1122 CUSPARSE_ACTION_NUMERIC, indexBase, 1123 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1124 #else 1125 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1126 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1127 #endif 1128 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1129 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1130 1131 /* Create the solve analysis information */ 1132 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1133 
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor;
     the fill mode flips because transposing swaps upper and lower */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */ 1195 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1196 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1197 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1198 upTriFactor->csrMat->values->data().get(), 1199 upTriFactor->csrMat->row_offsets->data().get(), 1200 upTriFactor->csrMat->column_indices->data().get(), 1201 upTriFactorT->csrMat->values->data().get(), 1202 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1203 CUSPARSE_ACTION_NUMERIC,indexBase, 1204 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1205 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1206 #endif 1207 1208 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1209 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1210 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1211 upTriFactor->csrMat->values->data().get(), 1212 upTriFactor->csrMat->row_offsets->data().get(), 1213 upTriFactor->csrMat->column_indices->data().get(), 1214 upTriFactorT->csrMat->values->data().get(), 1215 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1216 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1217 CUSPARSE_ACTION_NUMERIC, indexBase, 1218 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1219 #else 1220 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1221 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1222 #endif 1223 1224 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1225 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1226 1227 /* Create the solve analysis information */ 1228 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor: converts a PetscScalar to a PetscInt by truncating its real
   part; used with thrust::transform to turn a csr2csc of sequence values
   into an integer permutation (csr2csc_i) */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
cusparseStatus_t stat; 1274 cusparseIndexBase_t indexBase; 1275 cudaError_t err; 1276 PetscErrorCode ierr; 1277 1278 PetscFunctionBegin; 1279 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1280 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1281 PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1282 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1283 PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1284 if (A->transupdated) PetscFunctionReturn(0); 1285 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1286 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1287 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1288 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1289 } 1290 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1291 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1292 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1293 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1294 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1295 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1296 1297 /* set alpha and beta */ 1298 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1299 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1300 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1301 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1302 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1303 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1304 1305 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1306 CsrMatrix *matrixT = new CsrMatrix; 1307 matstructT->mat = matrixT; 1308 matrixT->num_rows = A->cmap->n; 1309 matrixT->num_cols = A->rmap->n; 1310 matrixT->num_entries = a->nz; 1311 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1312 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1313 matrixT->values = new THRUSTARRAY(a->nz); 1314 1315 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1316 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1317 1318 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1319 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1320 stat = cusparseCreateCsr(&matstructT->matDescr, 1321 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1322 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1323 matrixT->values->data().get(), 1324 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1325 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1326 #else 1327 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1328 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1329 1330 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1331 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1332 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1333 */ 1334 if (matrixT->num_entries) { 1335 stat = cusparseCreateCsr(&matstructT->matDescr, 1336 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1337 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1338 matrixT->values->data().get(), 1339 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1340 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1341 1342 } else { 1343 matstructT->matDescr = NULL; 1344 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1345 } 1346 #endif 1347 #endif 1348 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1349 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1350 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1351 #else 1352 CsrMatrix *temp = new CsrMatrix; 1353 CsrMatrix *tempT = new CsrMatrix; 1354 /* First convert HYB to CSR */ 1355 temp->num_rows = A->rmap->n; 1356 temp->num_cols = A->cmap->n; 1357 temp->num_entries = a->nz; 1358 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1359 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1360 temp->values = new THRUSTARRAY(a->nz); 1361 1362 stat = cusparse_hyb2csr(cusparsestruct->handle, 1363 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1364 temp->values->data().get(), 1365 temp->row_offsets->data().get(), 1366 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1367 1368 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1369 tempT->num_rows = A->rmap->n; 1370 tempT->num_cols = A->cmap->n; 1371 tempT->num_entries = a->nz; 1372 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1373 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1374 tempT->values = new THRUSTARRAY(a->nz); 1375 1376 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1377 temp->num_cols, temp->num_entries, 1378 temp->values->data().get(), 1379 temp->row_offsets->data().get(), 1380 temp->column_indices->data().get(), 1381 tempT->values->data().get(), 1382 tempT->column_indices->data().get(), 1383 tempT->row_offsets->data().get(), 1384 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1385 1386 /* Last, convert CSC to HYB */ 1387 cusparseHybMat_t hybMat; 1388 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1389 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1390 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1391 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1392 matstructT->descr, tempT->values->data().get(), 1393 tempT->row_offsets->data().get(), 1394 tempT->column_indices->data().get(), 1395 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1396 1397 /* assign the pointer */ 1398 matstructT->mat = hybMat; 1399 A->transupdated = PETSC_TRUE; 1400 /* delete temporaries */ 1401 if (tempT) { 1402 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1403 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1404 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1405 delete (CsrMatrix*) tempT; 1406 } 1407 if (temp) { 1408 if (temp->values) delete (THRUSTARRAY*) temp->values; 1409 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1410 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1411 delete (CsrMatrix*) temp; 1412 } 1413 #endif 1414 } 1415 } 1416 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */ 1417 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1418 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1419 PetscCheckFalse(!matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1420 PetscCheckFalse(!matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1421 PetscCheckFalse(!matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1422 PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1423 PetscCheckFalse(!matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1424 PetscCheckFalse(!matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1425 PetscCheckFalse(!matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1426 PetscCheckFalse(!matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1427 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1428 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1429 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1430 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1431 } 1432 if (!cusparsestruct->csr2csc_i) { 1433 THRUSTARRAY csr2csc_a(matrix->num_entries); 1434 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1435 1436 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1437 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1438 void *csr2cscBuffer; 1439 size_t csr2cscBufferSize; 1440 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1441 A->cmap->n, matrix->num_entries, 1442 matrix->values->data().get(), 1443 cusparsestruct->rowoffsets_gpu->data().get(), 1444 matrix->column_indices->data().get(), 1445 matrixT->values->data().get(), 1446 matrixT->row_offsets->data().get(), 
matrixT->column_indices->data().get(), cusparse_scalartype, 1447 CUSPARSE_ACTION_NUMERIC,indexBase, 1448 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1449 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1450 #endif 1451 1452 if (matrix->num_entries) { 1453 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1454 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1455 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1456 1457 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1458 should be filled with indexBase. So I just take a shortcut here. 1459 */ 1460 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1461 A->cmap->n,matrix->num_entries, 1462 csr2csc_a.data().get(), 1463 cusparsestruct->rowoffsets_gpu->data().get(), 1464 matrix->column_indices->data().get(), 1465 matrixT->values->data().get(), 1466 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1467 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1468 CUSPARSE_ACTION_NUMERIC,indexBase, 1469 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1470 #else 1471 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1472 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1473 #endif 1474 } else { 1475 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1476 } 1477 1478 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1479 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1480 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1481 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1482 #endif 1483 } 1484 
PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1485 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1486 matrixT->values->begin())); 1487 } 1488 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1489 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1490 /* the compressed row indices is not used for matTranspose */ 1491 matstructT->cprowIndices = NULL; 1492 /* assign the pointer */ 1493 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1494 A->transupdated = PETSC_TRUE; 1495 PetscFunctionReturn(0); 1496 } 1497 1498 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1499 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1500 { 1501 PetscInt n = xx->map->n; 1502 const PetscScalar *barray; 1503 PetscScalar *xarray; 1504 thrust::device_ptr<const PetscScalar> bGPU; 1505 thrust::device_ptr<PetscScalar> xGPU; 1506 cusparseStatus_t stat; 1507 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1508 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1509 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1510 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1511 PetscErrorCode ierr; 1512 1513 PetscFunctionBegin; 1514 /* Analyze the matrix and create the transpose ... 
on the fly */ 1515 if (!loTriFactorT && !upTriFactorT) { 1516 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1517 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1518 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1519 } 1520 1521 /* Get the GPU pointers */ 1522 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1523 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1524 xGPU = thrust::device_pointer_cast(xarray); 1525 bGPU = thrust::device_pointer_cast(barray); 1526 1527 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1528 /* First, reorder with the row permutation */ 1529 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1530 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1531 xGPU); 1532 1533 /* First, solve U */ 1534 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1535 upTriFactorT->csrMat->num_rows, 1536 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1537 upTriFactorT->csrMat->num_entries, 1538 #endif 1539 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1540 upTriFactorT->csrMat->values->data().get(), 1541 upTriFactorT->csrMat->row_offsets->data().get(), 1542 upTriFactorT->csrMat->column_indices->data().get(), 1543 upTriFactorT->solveInfo, 1544 xarray, 1545 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1546 tempGPU->data().get(), 1547 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1548 #else 1549 tempGPU->data().get());CHKERRCUSPARSE(stat); 1550 #endif 1551 1552 /* Then, solve L */ 1553 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1554 loTriFactorT->csrMat->num_rows, 1555 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1556 loTriFactorT->csrMat->num_entries, 1557 #endif 1558 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1559 loTriFactorT->csrMat->values->data().get(), 1560 
loTriFactorT->csrMat->row_offsets->data().get(), 1561 loTriFactorT->csrMat->column_indices->data().get(), 1562 loTriFactorT->solveInfo, 1563 tempGPU->data().get(), 1564 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1565 xarray, 1566 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1567 #else 1568 xarray);CHKERRCUSPARSE(stat); 1569 #endif 1570 1571 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1572 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1573 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1574 tempGPU->begin()); 1575 1576 /* Copy the temporary to the full solution. */ 1577 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1578 1579 /* restore */ 1580 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1581 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1582 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1583 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1584 PetscFunctionReturn(0); 1585 } 1586 1587 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1588 { 1589 const PetscScalar *barray; 1590 PetscScalar *xarray; 1591 cusparseStatus_t stat; 1592 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1593 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1594 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1595 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1596 PetscErrorCode ierr; 1597 1598 PetscFunctionBegin; 1599 /* Analyze the matrix and create the transpose ... 
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* transpose analysis is built lazily on first transpose solve */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/backward triangular solve of the factored matrix: x = U \ (L \ (P_r b)),
   then the column permutation is applied to the result. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
loTriFactor->csrMat->row_offsets->data().get(), 1693 loTriFactor->csrMat->column_indices->data().get(), 1694 loTriFactor->solveInfo, 1695 tempGPU->data().get(), 1696 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1697 xarray, 1698 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1699 #else 1700 xarray);CHKERRCUSPARSE(stat); 1701 #endif 1702 1703 /* Then, solve U */ 1704 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1705 upTriFactor->csrMat->num_rows, 1706 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1707 upTriFactor->csrMat->num_entries, 1708 #endif 1709 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1710 upTriFactor->csrMat->values->data().get(), 1711 upTriFactor->csrMat->row_offsets->data().get(), 1712 upTriFactor->csrMat->column_indices->data().get(), 1713 upTriFactor->solveInfo,xarray, 1714 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1715 tempGPU->data().get(), 1716 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1717 #else 1718 tempGPU->data().get());CHKERRCUSPARSE(stat); 1719 #endif 1720 1721 /* Last, reorder with the column permutation */ 1722 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1723 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1724 xGPU); 1725 1726 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1727 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1728 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1729 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1730 PetscFunctionReturn(0); 1731 } 1732 1733 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1734 { 1735 const PetscScalar *barray; 1736 PetscScalar *xarray; 1737 cusparseStatus_t stat; 1738 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1739 
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1740 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1741 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1742 PetscErrorCode ierr; 1743 1744 PetscFunctionBegin; 1745 /* Get the GPU pointers */ 1746 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1747 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1748 1749 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1750 /* First, solve L */ 1751 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1752 loTriFactor->csrMat->num_rows, 1753 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1754 loTriFactor->csrMat->num_entries, 1755 #endif 1756 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1757 loTriFactor->csrMat->values->data().get(), 1758 loTriFactor->csrMat->row_offsets->data().get(), 1759 loTriFactor->csrMat->column_indices->data().get(), 1760 loTriFactor->solveInfo, 1761 barray, 1762 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1763 tempGPU->data().get(), 1764 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1765 #else 1766 tempGPU->data().get());CHKERRCUSPARSE(stat); 1767 #endif 1768 1769 /* Next, solve U */ 1770 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1771 upTriFactor->csrMat->num_rows, 1772 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1773 upTriFactor->csrMat->num_entries, 1774 #endif 1775 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1776 upTriFactor->csrMat->values->data().get(), 1777 upTriFactor->csrMat->row_offsets->data().get(), 1778 upTriFactor->csrMat->column_indices->data().get(), 1779 upTriFactor->solveInfo, 1780 tempGPU->data().get(), 1781 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1782 xarray, 1783 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1784 #else 1785 xarray);CHKERRCUSPARSE(stat); 1786 #endif 1787 1788 ierr = 
VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copy the matrix values from the device CSR storage back to the host AIJ arrays,
   when the up-to-date copy lives only on the GPU.  Only values are copied; the
   sparsity pattern is assumed unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(0);
}

/* Read/write access to the host values array; syncs device->host first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  /* host copy may have been modified: mark the device copy stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access to the host values array; syncs device->host first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  /* read-only access: offload mask is left untouched */
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access: no device->host sync is needed since the caller overwrites */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  /* host copy was rewritten: mark the device copy stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Copy the AIJ matrix from host to device.  If the nonzero pattern is unchanged and
   the format is CSR, only values are transferred; otherwise the device structures
   are rebuilt from scratch. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  PetscCheckFalse(A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr =
PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern changed (or non-CSR format): rebuild all device structures */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* values not allocated yet: only structure goes to the GPU */
        else nnz = a->nz;
        PetscCheckFalse(nnz && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars used with CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR on the device, convert it to HYB, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: second tuple element += first */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

struct VecCUDAEquals
{
  /* Thrust functor: second tuple element = first */
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: first tuple element = second */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Product data cached between the symbolic and numeric phases of sparse matmat products */
struct MatMatCusparse {
  PetscBool      cisdense;
  PetscScalar    *Bt;
  Mat            X;
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Release all device buffers and descriptors owned by a MatMatCusparse, then free it */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc) { stat =
cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
#endif
  if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
#endif
  /* X is the intermediate dense matrix used by the PtAP/RARt products; freed with the container */
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense.
   Supports AB, AtB, ABt, PtAP and RARt; uses cusparseSpMM (CUDA >= 11.0) or csrmm (older CUDA).
   Expects the MatMatCusparse product data created by the matching symbolic phase. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A), the mult struct (plain or explicit transpose) and the sparse-product sizes */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheckFalse(!mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the sparse product is written to the intermediate X; C is formed from B and X below */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the cached work buffer only when the required size exceeds the one already allocated */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated mmdata->Bt with a cuBLAS geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt by multiplying the intermediate X with B (transposed for PtAP) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary CPU->GPU type conversions performed above */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse(SEQAIJCUSPARSE) x dense products: sets the sizes and type of C
   and allocates the MatMatCusparse product data (plus intermediate storage where needed) */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes per product type */
  switch
(product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase of sparse-sparse products C = op(A)*op(B), both operands SEQAIJCUSPARSE.
   Uses the SpGEMM descriptors and buffers created in the symbolic phase (cusparseSpGEMMreuse
   on CUDA >= 11.4, cusparseSpGEMM on CUDA 11.x, csrgemm on older CUDA). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheckFalse(B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us fall back to the plain AB kernel; the symbolic phase must have
     made the same decision, which is verified here */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheckFalse(!product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
PetscCheckFalse(!product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the (possibly explicitly transposed) mult structs used as SpGEMM operands */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* pattern was fixed in symbolic; only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of sparse-sparse products: computes the sparsity pattern of C = op(A)*op(B)
   on the GPU, mirrors it into the host Mat_SeqAIJ metadata, and stores the SpGEMM descriptors
   and work buffers needed by the numeric phase in the MatMatCusparse product data */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr =
PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* for symmetric operands use the plain AB kernel and record that decision so the numeric
     phase can verify it */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select operands, sizes (op(A) is m x k, op(B) is k x n) and compressed-row flags */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used by the SpGEMM calls (device pointer mode) */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each a(i,brow) multiplies the whole row brow of B: 2*nnz(B(brow,:)) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* create C's descriptor with nnz = 0; the pattern buffers are attached after the nnz computation */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void* dBuffer1 = NULL;
    void* dBuffer2 = NULL;
    void* dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    /* query/allocate the buffers needed for the nnz computation, then run it */
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
    cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
    /* compute the values once here; the numeric phase repeats only this call */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
    ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API...
*/
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU-computed CSR pattern into the host Mat_SeqAIJ metadata (i, j, ilen, imax, ...) */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row lengths and row-related statistics from the expanded host row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask =
PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B.
   Selects the product-symbolic implementation for mat->product based on the product type,
   whether B (and C for ABC) are SEQAIJCUSPARSE and not bound to the CPU, and the
   -mat*_backend_cpu command-line options that let the user force the CPU code path. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* detect dense B; the GPU path is only considered when neither A nor B is bound to the CPU */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* each product type has its own backend-selection option; the option name differs for the
       user-facing API (e.g. MatMatMult) vs the MatProduct API */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user forced the CPU backend */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx (trans = herm = PETSC_TRUE in the kernel) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* One thread per compressed-row entry: y[idx[i]] += x[i].
   Used to scatter-add the short work-vector result into the full output vector. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Common kernel behind all the MatMult* wrappers above. yy may be NULL (no add) or equal to zz
   (in-place add). Handles compressed-row storage via a work vector, and (CUDA >= 11) caches
   per-operation cuSPARSE dense-vector descriptors and SpMV buffers in matstruct->cuSpMV[opA]. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: op(A) x == 0, so zz is either zeroed or just a copy of yy */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheckFalse(!matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and build on demand) an explicitly stored transpose, applied with NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get up-to-date zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation descriptor cache below, hence the ABI guard */
      PetscCheckFalse(opA < 0 || opA > 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      /* pre-CUDA-11 legacy csrmv interface */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    /* NOTE(review): thrust typically throws std::exception-derived types, which this
       catch(char*) would not intercept — confirm against the rest of the file's convention */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Completes CPU assembly; if the nonzero pattern changed, the cached device matrix is stale
   and is freed so it will be rebuilt on demand. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create a sequential matrix of the CUSPARSE type and preallocate with the SeqAIJ routine */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees the GPU-side storage (plain or triangular-factor form), removes the composed
   methods installed at creation/convert time, then delegates to the SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr =
PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicates via the SeqAIJ duplicate and then converts the copy in place to CUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a X on the GPU. Uses cublasXaxpy when the nonzero patterns match,
   cuSPARSE csrgeam for SUBSET_NONZERO_PATTERN, and falls back to the SeqAIJ CPU
   code otherwise (or when the two matrices use different axpy implementations). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed implementations: invalidate the cached transpose and use the CPU path */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  PetscCheckFalse(cy->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheckFalse(cx->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* equal nnz: compare row offsets and column indices on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1*Y via csrgeam, writing the result into Y's existing pattern */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host here, hence the temporary pointer-mode switch */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the values arrays are conformal, so a dense axpy on the values suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* DIFFERENT_NONZERO_PATTERN (or the one-column workaround): use the CPU fallback */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y: scales the GPU values array in place with cublasXscal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zeros both the device copies (matrix and cached transpose, when present) and the host
   values array, then updates the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set when the device values were zeroed too */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask =
PETSC_OFFLOAD_CPU; 3523 PetscFunctionReturn(0); 3524 } 3525 3526 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3527 { 3528 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3529 PetscErrorCode ierr; 3530 3531 PetscFunctionBegin; 3532 if (A->factortype != MAT_FACTOR_NONE) { 3533 A->boundtocpu = flg; 3534 PetscFunctionReturn(0); 3535 } 3536 if (flg) { 3537 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3538 3539 A->ops->scale = MatScale_SeqAIJ; 3540 A->ops->axpy = MatAXPY_SeqAIJ; 3541 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3542 A->ops->mult = MatMult_SeqAIJ; 3543 A->ops->multadd = MatMultAdd_SeqAIJ; 3544 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3545 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3546 A->ops->multhermitiantranspose = NULL; 3547 A->ops->multhermitiantransposeadd = NULL; 3548 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3549 ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3552 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3553 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3554 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3555 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3556 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3557 } else { 3558 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3559 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3560 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3561 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3562 
A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3563 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3564 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3565 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3566 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3567 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3568 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3569 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3570 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3571 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3572 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3573 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3574 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3575 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3576 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3577 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3578 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3579 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3580 } 3581 A->boundtocpu = flg; 3582 if (flg && a->inode.size) { 3583 a->inode.use = PETSC_TRUE; 3584 } else { 3585 a->inode.use = PETSC_FALSE; 3586 } 3587 PetscFunctionReturn(0); 3588 } 3589 3590 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, 
MatReuse reuse, Mat* newmat) 3591 { 3592 PetscErrorCode ierr; 3593 cusparseStatus_t stat; 3594 Mat B; 3595 3596 PetscFunctionBegin; 3597 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3598 if (reuse == MAT_INITIAL_MATRIX) { 3599 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3600 } else if (reuse == MAT_REUSE_MATRIX) { 3601 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3602 } 3603 B = *newmat; 3604 3605 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3606 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3607 3608 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3609 if (B->factortype == MAT_FACTOR_NONE) { 3610 Mat_SeqAIJCUSPARSE *spptr; 3611 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3612 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3613 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3614 spptr->format = MAT_CUSPARSE_CSR; 3615 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3616 #if PETSC_PKG_CUDA_VERSION_GE(11,2,0) 3617 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3618 #else 3619 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3620 #endif 3621 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3622 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3623 #endif 3624 B->spptr = spptr; 3625 } else { 3626 Mat_SeqAIJCUSPARSETriFactors *spptr; 3627 3628 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3629 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3630 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3631 B->spptr = spptr; 3632 } 3633 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3634 } 3635 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3636 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3637 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3638 B->ops->setfromoptions = 
MatSetFromOptions_SeqAIJCUSPARSE; 3639 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3640 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3641 3642 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3643 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3644 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3645 #if defined(PETSC_HAVE_HYPRE) 3646 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3647 #endif 3648 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr); 3649 PetscFunctionReturn(0); 3650 } 3651 3652 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3653 { 3654 PetscErrorCode ierr; 3655 3656 PetscFunctionBegin; 3657 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3658 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3659 PetscFunctionReturn(0); 3660 } 3661 3662 /*MC 3663 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3664 3665 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3666 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3667 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3668 3669 Options Database Keys: 3670 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3671 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3672 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). 
Other options include ell (ellpack) or hyb (hybrid). 3673 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3674 3675 Level: beginner 3676 3677 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3678 M*/ 3679 3680 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3681 3682 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3683 { 3684 PetscErrorCode ierr; 3685 3686 PetscFunctionBegin; 3687 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3688 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3689 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3690 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3691 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3692 3693 PetscFunctionReturn(0); 3694 } 3695 3696 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3697 { 3698 Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 3699 cudaError_t cerr; 3700 3701 PetscFunctionBegin; 3702 if (!cusp) PetscFunctionReturn(0); 3703 delete cusp->cooPerm; 3704 delete cusp->cooPerm_a; 3705 cusp->cooPerm = NULL; 3706 cusp->cooPerm_a = NULL; 3707 if (cusp->use_extended_coo) { 3708 cerr = cudaFree(cusp->jmap_d);CHKERRCUDA(cerr); 3709 cerr = cudaFree(cusp->perm_d);CHKERRCUDA(cerr); 3710 } 3711 cusp->use_extended_coo = PETSC_FALSE; 3712 PetscFunctionReturn(0); 3713 } 3714 3715 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3716 { 3717 PetscErrorCode ierr; 
3718 cusparseStatus_t stat; 3719 cudaError_t cerr; 3720 3721 PetscFunctionBegin; 3722 if (*cusparsestruct) { 3723 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3724 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3725 delete (*cusparsestruct)->workVector; 3726 delete (*cusparsestruct)->rowoffsets_gpu; 3727 delete (*cusparsestruct)->cooPerm; 3728 delete (*cusparsestruct)->cooPerm_a; 3729 delete (*cusparsestruct)->csr2csc_i; 3730 if ((*cusparsestruct)->handle) { stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat); } 3731 if ((*cusparsestruct)->jmap_d) { cerr = cudaFree((*cusparsestruct)->jmap_d);CHKERRCUDA(cerr); } 3732 if ((*cusparsestruct)->perm_d) { cerr = cudaFree((*cusparsestruct)->perm_d);CHKERRCUDA(cerr); } 3733 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3734 } 3735 PetscFunctionReturn(0); 3736 } 3737 3738 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3739 { 3740 PetscFunctionBegin; 3741 if (*mat) { 3742 delete (*mat)->values; 3743 delete (*mat)->column_indices; 3744 delete (*mat)->row_offsets; 3745 delete *mat; 3746 *mat = 0; 3747 } 3748 PetscFunctionReturn(0); 3749 } 3750 3751 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3752 { 3753 cusparseStatus_t stat; 3754 PetscErrorCode ierr; 3755 3756 PetscFunctionBegin; 3757 if (*trifactor) { 3758 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3759 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3760 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3761 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3762 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 
3763 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3764 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3765 #endif 3766 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3767 } 3768 PetscFunctionReturn(0); 3769 } 3770 3771 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3772 { 3773 CsrMatrix *mat; 3774 cusparseStatus_t stat; 3775 cudaError_t err; 3776 3777 PetscFunctionBegin; 3778 if (*matstruct) { 3779 if ((*matstruct)->mat) { 3780 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3781 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3782 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3783 #else 3784 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3785 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3786 #endif 3787 } else { 3788 mat = (CsrMatrix*)(*matstruct)->mat; 3789 CsrMatrix_Destroy(&mat); 3790 } 3791 } 3792 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3793 delete (*matstruct)->cprowIndices; 3794 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3795 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3796 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3797 3798 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3799 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3800 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3801 for (int i=0; i<3; i++) { 3802 if (mdata->cuSpMV[i].initialized) { 3803 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3804 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3805 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3806 } 3807 } 3808 #endif 
3809 delete *matstruct; 3810 *matstruct = NULL; 3811 } 3812 PetscFunctionReturn(0); 3813 } 3814 3815 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3816 { 3817 PetscErrorCode ierr; 3818 3819 PetscFunctionBegin; 3820 if (*trifactors) { 3821 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3822 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3823 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3824 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3825 delete (*trifactors)->rpermIndices; 3826 delete (*trifactors)->cpermIndices; 3827 delete (*trifactors)->workVector; 3828 (*trifactors)->rpermIndices = NULL; 3829 (*trifactors)->cpermIndices = NULL; 3830 (*trifactors)->workVector = NULL; 3831 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3832 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3833 (*trifactors)->init_dev_prop = PETSC_FALSE; 3834 } 3835 PetscFunctionReturn(0); 3836 } 3837 3838 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3839 { 3840 PetscErrorCode ierr; 3841 cusparseHandle_t handle; 3842 cusparseStatus_t stat; 3843 3844 PetscFunctionBegin; 3845 if (*trifactors) { 3846 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3847 if (handle = (*trifactors)->handle) { 3848 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3849 } 3850 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3851 } 3852 PetscFunctionReturn(0); 3853 } 3854 3855 struct IJCompare 3856 { 3857 __host__ __device__ 3858 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3859 { 3860 if (t1.get<0>() < t2.get<0>()) return true; 3861 if 
(t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3862 return false; 3863 } 3864 }; 3865 3866 struct IJEqual 3867 { 3868 __host__ __device__ 3869 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3870 { 3871 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3872 return true; 3873 } 3874 }; 3875 3876 struct IJDiff 3877 { 3878 __host__ __device__ 3879 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3880 { 3881 return t1 == t2 ? 0 : 1; 3882 } 3883 }; 3884 3885 struct IJSum 3886 { 3887 __host__ __device__ 3888 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3889 { 3890 return t1||t2; 3891 } 3892 }; 3893 3894 #include <thrust/iterator/discard_iterator.h> 3895 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 3896 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 3897 { 3898 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3899 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3900 THRUSTARRAY *cooPerm_v = NULL; 3901 thrust::device_ptr<const PetscScalar> d_v; 3902 CsrMatrix *matrix; 3903 PetscErrorCode ierr; 3904 PetscInt n; 3905 3906 PetscFunctionBegin; 3907 PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3908 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3909 if (!cusp->cooPerm) { 3910 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3911 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3912 PetscFunctionReturn(0); 3913 } 3914 matrix = (CsrMatrix*)cusp->mat->mat; 3915 PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3916 if (!v) { 3917 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3918 goto finalize; 3919 } 3920 n = cusp->cooPerm->size(); 3921 if (isCudaMem(v)) { 3922 d_v = 
thrust::device_pointer_cast(v); 3923 } else { 3924 cooPerm_v = new THRUSTARRAY(n); 3925 cooPerm_v->assign(v,v+n); 3926 d_v = cooPerm_v->data(); 3927 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3928 } 3929 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3930 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3931 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3932 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3933 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3934 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3935 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3936 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3937 */ 3938 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3939 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3940 delete cooPerm_w; 3941 } else { 3942 /* all nonzeros in d_v[] are unique entries */ 3943 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3944 matrix->values->begin())); 3945 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3946 matrix->values->end())); 3947 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3948 } 3949 } else { 3950 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3951 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3952 
thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3953 } else { 3954 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3955 matrix->values->begin())); 3956 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3957 matrix->values->end())); 3958 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3959 } 3960 } 3961 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3962 finalize: 3963 delete cooPerm_v; 3964 A->offloadmask = PETSC_OFFLOAD_GPU; 3965 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3966 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3967 ierr = PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3968 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3969 ierr = PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr); 3970 a->reallocs = 0; 3971 A->info.mallocs += 0; 3972 A->info.nz_unneeded = 0; 3973 A->assembled = A->was_assembled = PETSC_TRUE; 3974 A->num_ass++; 3975 PetscFunctionReturn(0); 3976 } 3977 3978 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3979 { 3980 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3981 PetscErrorCode ierr; 3982 3983 PetscFunctionBegin; 3984 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3985 if (!cusp) PetscFunctionReturn(0); 3986 if (destroy) { 3987 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3988 delete cusp->csr2csc_i; 3989 cusp->csr2csc_i = NULL; 3990 } 3991 A->transupdated = PETSC_FALSE; 3992 PetscFunctionReturn(0); 3993 } 3994 3995 #include <thrust/binary_search.h> 3996 /* 'Basic' means it 
only works when coo_i[] and coo_j[] do not contain negative indices */ 3997 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 3998 { 3999 PetscErrorCode ierr; 4000 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4001 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4002 PetscInt cooPerm_n, nzr = 0; 4003 cudaError_t cerr; 4004 4005 PetscFunctionBegin; 4006 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 4007 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 4008 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 4009 if (n != cooPerm_n) { 4010 delete cusp->cooPerm; 4011 delete cusp->cooPerm_a; 4012 cusp->cooPerm = NULL; 4013 cusp->cooPerm_a = NULL; 4014 } 4015 if (n) { 4016 THRUSTINTARRAY d_i(n); 4017 THRUSTINTARRAY d_j(n); 4018 THRUSTINTARRAY ii(A->rmap->n); 4019 4020 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 4021 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 4022 4023 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 4024 d_i.assign(coo_i,coo_i+n); 4025 d_j.assign(coo_j,coo_j+n); 4026 4027 /* Ex. 
4028 n = 6 4029 coo_i = [3,3,1,4,1,4] 4030 coo_j = [3,2,2,5,2,6] 4031 */ 4032 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 4033 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 4034 4035 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4036 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4037 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4038 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 4039 THRUSTINTARRAY w = d_j; 4040 4041 /* 4042 d_i = [1,1,3,3,4,4] 4043 d_j = [2,2,2,3,5,6] 4044 cooPerm = [2,4,1,0,3,5] 4045 */ 4046 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4047 4048 /* 4049 d_i = [1,3,3,4,4,x] 4050 ^ekey 4051 d_j = [2,2,3,5,6,x] 4052 ^nekye 4053 */ 4054 if (nekey == ekey) { /* all entries are unique */ 4055 delete cusp->cooPerm_a; 4056 cusp->cooPerm_a = NULL; 4057 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4058 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4059 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4060 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4061 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4062 w[0] = 0; 4063 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4064 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4065 } 4066 thrust::counting_iterator<PetscInt> search_begin(0); 4067 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of 
[0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4068 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4069 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4070 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4071 4072 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 4073 a->singlemalloc = PETSC_FALSE; 4074 a->free_a = PETSC_TRUE; 4075 a->free_ij = PETSC_TRUE; 4076 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 4077 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4078 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4079 a->nz = a->maxnz = a->i[A->rmap->n]; 4080 a->rmax = 0; 4081 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 4082 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 4083 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4084 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 4085 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 4086 for (PetscInt i = 0; i < A->rmap->n; i++) { 4087 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4088 nzr += (PetscInt)!!(nnzr); 4089 a->ilen[i] = a->imax[i] = nnzr; 4090 a->rmax = PetscMax(a->rmax,nnzr); 4091 } 4092 a->nonzerorowcnt = nzr; 4093 A->preallocated = PETSC_TRUE; 4094 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4095 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 4096 } else { 4097 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 4098 } 4099 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 4100 4101 /* We want to allocate the CUSPARSE struct for matvec now. 
4102 The code is so convoluted now that I prefer to copy zeros */ 4103 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 4104 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 4105 A->offloadmask = PETSC_OFFLOAD_CPU; 4106 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4107 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 4108 PetscFunctionReturn(0); 4109 } 4110 4111 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 4112 { 4113 PetscErrorCode ierr; 4114 cudaError_t cerr; 4115 Mat_SeqAIJ *seq; 4116 Mat_SeqAIJCUSPARSE *dev; 4117 PetscBool coo_basic = PETSC_TRUE; 4118 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4119 4120 PetscFunctionBegin; 4121 ierr = MatResetPreallocationCOO_SeqAIJ(mat);CHKERRQ(ierr); 4122 ierr = MatResetPreallocationCOO_SeqAIJCUSPARSE(mat);CHKERRQ(ierr); 4123 if (coo_i) { 4124 ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr); 4125 if (PetscMemTypeHost(mtype)) { 4126 for (PetscCount k=0; k<coo_n; k++) { 4127 if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4128 } 4129 } 4130 } 4131 4132 if (coo_basic) { /* i,j are on device or do not contain negative indices */ 4133 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 4134 } else { 4135 ierr = MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 4136 mat->offloadmask = PETSC_OFFLOAD_CPU; 4137 ierr = MatSeqAIJCUSPARSECopyToGPU(mat);CHKERRQ(ierr); 4138 seq = static_cast<Mat_SeqAIJ*>(mat->data); 4139 dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 4140 cerr = cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount));CHKERRCUDA(cerr); 4141 cerr = cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4142 cerr = cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount));CHKERRCUDA(cerr); 4143 cerr = 
cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}

/* Device kernel: accumulate COO input values into the CSR value array.
   Grid-stride loop: each thread handles CSR entries i = tid, tid+stride, ... so any
   launch configuration is correct.  For entry i it sums the kv[] contributions listed
   in perm[jmap[i]..jmap[i+1]) and either overwrites (INSERT_VALUES) or adds to a[i]. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* widen to PetscCount (64-bit) before multiplying: blockIdx.x*blockDim.x would
     otherwise be evaluated in 32-bit unsigned arithmetic and silently overflow
     for very large nnz */
  PetscCount i = blockIdx.x*(PetscCount)blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x*(PetscCount)blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}

/* Insert/add the COO values v[] (ordered as in the preceding MatSetPreallocationCOO call)
   into the device CSR value array of A.  v may reside on host or device; a host array is
   staged through a temporary device buffer. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount         Annz  = seq->nz;
  PetscMemType       memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      cerr = cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    /* INSERT_VALUES overwrites every entry, so write-only access is enough (no
       host->device refresh of A); ADD_VALUES must read the current values */
    if (imode == INSERT_VALUES) {ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr);}
    else {ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr);}

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      CHKERRCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without forcing a sync */
    }

    if (imode == INSERT_VALUES) {ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);}
    else {ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);}

    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): requesting only one of i/j returns early and leaves both unset;
     the later per-pointer guards suggest either may have been intended to work alone
     — confirm callers always request both */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* device CSR is stored compressed; build (and cache) the uncompressed row offsets on the device */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* pointers were borrowed, not copied: just null them out */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; /* device-side storage attached to A */
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw value array */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make the device copy current before handing out a pointer into it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state bump, no invalidation; just drop the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.
A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; /* device-side storage attached to A */
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw value array */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* read-write access: bring the device copy up to date first */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may write through *a: the device now holds the authoritative copy,
     and any cached explicit transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* host-side cached quantities (e.g. the diagonal) are stale after a possible write */
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* bump the object state so dependents notice the values changed */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; /* device-side storage attached to A */
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw value array */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: the GPU structure must already exist (no copy is triggered here) */
  PetscCheckFalse(!cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* caller will overwrite device values: mark GPU authoritative, drop stale transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written on the device: stale diagonal cache, new object state */
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Lexicographic (row,col) ordering on (row, col, value, tag) tuples, used to merge
   two COO streams into a single sorted stream. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    /* compare rows first; fall back to columns only on a row tie */
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};

/* Adds a constant offset to an integer; used to shift B's column indices by
   A's column count when concatenating. */
struct Shift
{
  int _delta;

  Shift(int delta) : _delta(delta) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _delta;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* first call: build C = [A B] from scratch (m rows, A->cmap->n + B->cmap->n columns) */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr = new CsrMatrix;
    Cmat->cprowIndices = NULL;
    c->compressedrow.use = PETSC_FALSE;
    c->compressedrow.nrows = 0;
    c->compressedrow.i = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Ccusp->nrows = m;
    Ccusp->mat = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows = m;
    Ccsr->num_cols = n;
    /* set up the cusparse descriptor and the device-resident scalar constants used by SpMV */
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records, for each C entry, where it came from; reused on later MAT_REUSE_MATRIX calls */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    /* NOTE(review): when c->nz == 0 the row_offsets array above is allocated but never
       filled (the coo2csr below is skipped), yet it is copied to c->i further down —
       presumably callers never merge two empty matrices with m > 0; verify */
    if (c->nz) {
      /* strategy: convert A and B to COO, merge the two streams sorted by (row,col),
         then convert the merged COO rows back to CSR row offsets */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets into explicit COO row indices for A and B */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag each entry with its origin (1 = from A, 0 = from B) so the merge output
         order can be split back into per-source permutations afterwards */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's columns by A->cmap->n on the fly so they land to the right of A's block */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      /* zip (row, col, value, origin-tag) for each stream; merge keeps (row,col) order */
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place column shift applied to B above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split wPerm's origin tags into the two halves of cooPerm: positions of A entries
         into [0,Annz), positions of B entries into [Annz,nz) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        /* C' = [A' ; B'] : transposes stack vertically, so the transpose CSR is just
           A' rows followed by B' rows with B's offsets shifted by a->nz */
        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          /* step back one: A's closing offset coincides with B's (shifted) opening offset */
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the (i,j) structure to the host so C behaves as a normal SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* per-row lengths and row statistics derived from the freshly copied row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure is fixed; scatter the current values of A and B into C
       through the cached cooPerm permutation */
    PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheckFalse(!Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheckFalse(Ccusp->cooPerm->size() != Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* cooPerm[0..Annz) are C positions of A's entries, cooPerm[Annz..nz) those of B's */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheckFalse(!Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ?
(CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* refresh C' values in place: A' values first, then B' values */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Copy n entries of the device value array of A into v[]:
   with idx[] given, gathers av[idx[0..n)] (host index list is staged to the device);
   with idx == NULL, copies the first n entries contiguously.
   v may be a host or a device pointer; a host v receives the data via a
   device-to-host copy. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;        /* true when v already lives in device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* gather into a device scratch buffer first, then copy down to the host */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the data moved device->host when v is a host pointer: log it in that direction
     (was incorrectly logged with PetscLogCpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}