/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* String table in the layout PetscOptionsEnum() expects: the enum value names in 0-based value
   order, then the enum type name, then the common prefix, then a terminating 0 */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
/* Note: the SpMM table lists "CSR_ALG1" before "COO_ALG4" so the string positions match the
   cuSPARSE integer values 4 and 5 above (positions, not listing order, define the parsed value) */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the SeqAIJCUSPARSE implementations installed on the Mat ops table below */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Query callback composed on factor matrices below; reports MATSOLVERCUSPARSE as the solver package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Factor-matrix constructor for MATSOLVERCUSPARSE: creates an empty n x n SEQAIJCUSPARSE matrix,
   installs the symbolic (I)LU / IC(C) callbacks (falling back to the CPU SeqAIJ callbacks when A
   is bound to the CPU), and records the preferred orderings for each factor type */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE: stores the requested storage
   format in the GPU-side Mat_SeqAIJCUSPARSE structure (same field for MULT and ALL) */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A (no-op otherwise) */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetUseCPUSolve() for SEQAIJCUSPARSE: records the flag consulted by
   MatLUFactorNumeric_SeqAIJCUSPARSE when choosing solve routines */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Output Parameter:

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}

/* MatSetOption override: handles MAT_FORM_EXPLICIT_TRANSPOSE specially (invalidating any cached
   GPU transpose when the option is turned off); everything else is forwarded to SeqAIJ */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU: syncs A to the host, runs the CPU SeqAIJ numeric factorization, then installs the
   appropriate solve routines (GPU versions unless use_cpu_solve is set; NaturalOrdering variants
   when both permutations are identities) and copies the triangular factors to the GPU */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
*/
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));
  if (row_identity && col_identity) {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_* runtime options: storage format (non-factored matrices only),
   CPU-solve flag, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any previous GPU triangular-factor data, delegates to the CPU SeqAIJ
   symbolic factorization, and installs the CUSPARSE numeric-factorization callback */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as the ILU variant above, delegating to MatLUFactorSymbolic_SeqAIJ */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: resets GPU factor data, delegates to SeqAIJ, installs the CUSPARSE Cholesky
   numeric callback */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as the ICC variant above */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
PetscFunctionReturn(0);
}

/* Builds the unit-diagonal lower-triangular factor L from the combined (I)LU factor stored in
   A's SeqAIJ data (strictly-lower part of each row plus an explicit 1.0 on the diagonal),
   uploads it as a 32-bit-index CSR matrix, and runs the cuSPARSE triangular-solve analysis on
   first use; on later calls (loTriFactor already present) only the values are refreshed */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers to speed up the host->device copies below */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
        AiLo[0] = (PetscInt) 0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt) 0;
        AALo[0] = (MatScalar) 1.0;
        v       = aa;
        vi      = aj;
        offset  = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                            loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                            loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                            loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            loTriFactor->solveInfo,
                                            loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                            loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) so later value-only updates can reuse it; the index buffers are freed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds and uploads the upper-triangular factor U (non-unit diagonal, stored inverted) from the
   combined (I)LU factor in A; mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix above */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host staging buffers for the host->device copies below */
        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix; rows are walked backwards because the SeqAIJ
           adiag[] layout stores the U part of each row from the end of the array */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements; v[nz] holds 1/diag, so store its reciprocal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                                  &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                            upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                            upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                            upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            upTriFactor->solveInfo,
                                            upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                            upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AAUp kept (as AA_h) for later value-only refreshes; index buffers are freed */
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else {
        /* structure unchanged: refresh only the values on the GPU */
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds both ILU triangular factors on the GPU, allocates the solve work vector, and caches the
   row/column permutation indices on the device when the orderings are not identities */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Builds the GPU triangular factors for IC(C) from the SeqSBAIJ-format factor stored in A->data
   (function body continues beyond this chunk) */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n
= A->rmap->n,i,offset,nz,j; 713 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 714 const PetscInt *ai = b->i,*aj = b->j,*vj; 715 const MatScalar *aa = b->a,*v; 716 717 PetscFunctionBegin; 718 if (!n) PetscFunctionReturn(0); 719 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 720 try { 721 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 722 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 723 if (!upTriFactor && !loTriFactor) { 724 /* Allocate Space for the upper triangular matrix */ 725 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 726 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 727 728 /* Fill the upper triangular matrix */ 729 AiUp[0]=(PetscInt) 0; 730 AiUp[n]=nzUpper; 731 offset = 0; 732 for (i=0; i<n; i++) { 733 /* set the pointers */ 734 v = aa + ai[i]; 735 vj = aj + ai[i]; 736 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 737 738 /* first, set the diagonal elements */ 739 AjUp[offset] = (PetscInt) i; 740 AAUp[offset] = (MatScalar)1.0/v[nz]; 741 AiUp[i] = offset; 742 AALo[offset] = (MatScalar)1.0/v[nz]; 743 744 offset+=1; 745 if (nz>0) { 746 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 747 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 748 for (j=offset; j<offset+nz; j++) { 749 AAUp[j] = -AAUp[j]; 750 AALo[j] = AAUp[j]/v[nz]; 751 } 752 offset+=nz; 753 } 754 } 755 756 /* allocate space for the triangular factor information */ 757 PetscCall(PetscNew(&upTriFactor)); 758 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 759 760 /* Create the matrix description */ 761 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 762 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 763 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 764 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 765 #else 766 
PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 767 #endif 768 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 769 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 770 771 /* set the matrix */ 772 upTriFactor->csrMat = new CsrMatrix; 773 upTriFactor->csrMat->num_rows = A->rmap->n; 774 upTriFactor->csrMat->num_cols = A->cmap->n; 775 upTriFactor->csrMat->num_entries = a->nz; 776 777 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 778 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 779 780 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 781 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 782 783 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 784 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 785 786 /* set the operation */ 787 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 788 789 /* Create the solve analysis information */ 790 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 791 PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 792 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 793 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 794 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 795 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 796 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 797 &upTriFactor->solveBufferSize)); 798 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 799 #endif 800 801 /* perform the solve analysis */ 802 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 803 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 804 
upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 805 upTriFactor->csrMat->column_indices->data().get(), 806 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 807 upTriFactor->solveInfo, 808 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 809 #else 810 upTriFactor->solveInfo)); 811 #endif 812 PetscCallCUDA(WaitForCUDA()); 813 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 814 815 /* assign the pointer */ 816 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 817 818 /* allocate space for the triangular factor information */ 819 PetscCall(PetscNew(&loTriFactor)); 820 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 821 822 /* Create the matrix description */ 823 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 824 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 825 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 826 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 827 #else 828 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 829 #endif 830 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 831 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 832 833 /* set the operation */ 834 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 835 836 /* set the matrix */ 837 loTriFactor->csrMat = new CsrMatrix; 838 loTriFactor->csrMat->num_rows = A->rmap->n; 839 loTriFactor->csrMat->num_cols = A->cmap->n; 840 loTriFactor->csrMat->num_entries = a->nz; 841 842 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 843 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 844 845 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 846 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 847 848 loTriFactor->csrMat->values = new 
THRUSTARRAY(a->nz); 849 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 850 851 /* Create the solve analysis information */ 852 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 853 PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 854 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 855 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 856 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 857 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 858 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 859 &loTriFactor->solveBufferSize)); 860 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 861 #endif 862 863 /* perform the solve analysis */ 864 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 865 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 866 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 867 loTriFactor->csrMat->column_indices->data().get(), 868 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 869 loTriFactor->solveInfo, 870 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 871 #else 872 loTriFactor->solveInfo)); 873 #endif 874 PetscCallCUDA(WaitForCUDA()); 875 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 876 877 /* assign the pointer */ 878 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 879 880 PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 881 PetscCallCUDA(cudaFreeHost(AiUp)); 882 PetscCallCUDA(cudaFreeHost(AjUp)); 883 } else { 884 /* Fill the upper triangular matrix */ 885 offset = 0; 886 for (i=0; i<n; i++) { 887 /* set the pointers */ 888 v = aa + ai[i]; 889 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 890 891 /* first, set the diagonal 
elements */ 892 AAUp[offset] = 1.0/v[nz]; 893 AALo[offset] = 1.0/v[nz]; 894 895 offset+=1; 896 if (nz>0) { 897 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 898 for (j=offset; j<offset+nz; j++) { 899 AAUp[j] = -AAUp[j]; 900 AALo[j] = AAUp[j]/v[nz]; 901 } 902 offset+=nz; 903 } 904 } 905 PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 906 PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 907 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 908 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 909 PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 910 } 911 PetscCallCUDA(cudaFreeHost(AAUp)); 912 PetscCallCUDA(cudaFreeHost(AALo)); 913 } catch(char *ex) { 914 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 915 } 916 } 917 PetscFunctionReturn(0); 918 } 919 920 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 921 { 922 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 923 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 924 IS ip = a->row; 925 PetscBool perm_identity; 926 PetscInt n = A->rmap->n; 927 928 PetscFunctionBegin; 929 PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 930 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 931 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 932 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 933 934 A->offloadmask = PETSC_OFFLOAD_BOTH; 935 936 /* lower triangular indices */ 937 PetscCall(ISIdentity(ip,&perm_identity)); 938 if (!perm_identity) { 939 IS iip; 940 const PetscInt *irip,*rip; 941 942 PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 943 PetscCall(ISGetIndices(iip,&irip)); 944 PetscCall(ISGetIndices(ip,&rip)); 945 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 946 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 947 cusparseTriFactors->cpermIndices = new 
THRUSTINTARRAY(n); 948 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 949 PetscCall(ISRestoreIndices(iip,&irip)); 950 PetscCall(ISDestroy(&iip)); 951 PetscCall(ISRestoreIndices(ip,&rip)); 952 PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 953 } 954 PetscFunctionReturn(0); 955 } 956 957 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 958 { 959 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 960 IS ip = b->row; 961 PetscBool perm_identity; 962 963 PetscFunctionBegin; 964 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 965 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 966 B->offloadmask = PETSC_OFFLOAD_CPU; 967 /* determine which version of MatSolve needs to be used. */ 968 PetscCall(ISIdentity(ip,&perm_identity)); 969 if (perm_identity) { 970 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 971 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 972 B->ops->matsolve = NULL; 973 B->ops->matsolvetranspose = NULL; 974 } else { 975 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 976 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 977 B->ops->matsolve = NULL; 978 B->ops->matsolvetranspose = NULL; 979 } 980 981 /* get the triangular factors */ 982 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 983 PetscFunctionReturn(0); 984 } 985 986 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 987 { 988 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 989 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 990 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 991 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 992 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 993 cusparseIndexBase_t indexBase; 994 cusparseMatrixType_t matrixType; 995 cusparseFillMode_t fillMode; 996 
cusparseDiagType_t diagType; 997 998 PetscFunctionBegin; 999 /* allocate space for the transpose of the lower triangular factor */ 1000 PetscCall(PetscNew(&loTriFactorT)); 1001 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1002 1003 /* set the matrix descriptors of the lower triangular factor */ 1004 matrixType = cusparseGetMatType(loTriFactor->descr); 1005 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1006 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1007 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1008 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1009 1010 /* Create the matrix description */ 1011 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1012 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1013 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1014 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1015 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1016 1017 /* set the operation */ 1018 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1019 1020 /* allocate GPU space for the CSC of the lower triangular factor*/ 1021 loTriFactorT->csrMat = new CsrMatrix; 1022 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1023 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1024 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1025 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1026 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1027 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1028 1029 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1030 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1031 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1032 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1033 loTriFactor->csrMat->values->data().get(), 1034 loTriFactor->csrMat->row_offsets->data().get(), 1035 loTriFactor->csrMat->column_indices->data().get(), 1036 loTriFactorT->csrMat->values->data().get(), 1037 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1038 CUSPARSE_ACTION_NUMERIC,indexBase, 1039 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1040 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 1041 #endif 1042 1043 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1044 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1045 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1046 loTriFactor->csrMat->values->data().get(), 1047 loTriFactor->csrMat->row_offsets->data().get(), 1048 loTriFactor->csrMat->column_indices->data().get(), 1049 loTriFactorT->csrMat->values->data().get(), 1050 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1051 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1052 CUSPARSE_ACTION_NUMERIC, indexBase, 1053 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 1054 #else 1055 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1056 CUSPARSE_ACTION_NUMERIC, indexBase)); 1057 #endif 1058 PetscCallCUDA(WaitForCUDA()); 1059 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1060 1061 /* Create the solve analysis information */ 1062 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1063 PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo)); 1064 #if 
PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1065 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1066 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1067 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1068 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1069 &loTriFactorT->solveBufferSize)); 1070 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 1071 #endif 1072 1073 /* perform the solve analysis */ 1074 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1075 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1076 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1077 loTriFactorT->csrMat->column_indices->data().get(), 1078 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1079 loTriFactorT->solveInfo, 1080 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1081 #else 1082 loTriFactorT->solveInfo)); 1083 #endif 1084 PetscCallCUDA(WaitForCUDA()); 1085 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1086 1087 /* assign the pointer */ 1088 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1089 1090 /*********************************************/ 1091 /* Now the Transpose of the Upper Tri Factor */ 1092 /*********************************************/ 1093 1094 /* allocate space for the transpose of the upper triangular factor */ 1095 PetscCall(PetscNew(&upTriFactorT)); 1096 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1097 1098 /* set the matrix descriptors of the upper triangular factor */ 1099 matrixType = cusparseGetMatType(upTriFactor->descr); 1100 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1101 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1102 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1103 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1104 1105 /* Create the matrix description */ 1106 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1107 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1108 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1109 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1110 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1111 1112 /* set the operation */ 1113 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1114 1115 /* allocate GPU space for the CSC of the upper triangular factor*/ 1116 upTriFactorT->csrMat = new CsrMatrix; 1117 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1118 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1119 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1120 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1121 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1122 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1123 1124 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1125 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1126 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1127 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1128 upTriFactor->csrMat->values->data().get(), 1129 upTriFactor->csrMat->row_offsets->data().get(), 1130 upTriFactor->csrMat->column_indices->data().get(), 1131 upTriFactorT->csrMat->values->data().get(), 1132 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1133 CUSPARSE_ACTION_NUMERIC,indexBase, 1134 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1135 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1136 #endif 1137 1138 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1139 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1140 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1141 upTriFactor->csrMat->values->data().get(), 1142 upTriFactor->csrMat->row_offsets->data().get(), 1143 upTriFactor->csrMat->column_indices->data().get(), 1144 upTriFactorT->csrMat->values->data().get(), 1145 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1146 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1147 CUSPARSE_ACTION_NUMERIC, indexBase, 1148 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1149 #else 1150 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1151 CUSPARSE_ACTION_NUMERIC, indexBase)); 1152 #endif 1153 1154 PetscCallCUDA(WaitForCUDA()); 1155 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1156 1157 /* Create the solve analysis information */ 1158 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1159 PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo)); 1160 
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1161 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1162 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1163 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1164 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1165 &upTriFactorT->solveBufferSize)); 1166 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1167 #endif 1168 1169 /* perform the solve analysis */ 1170 /* christ, would it have killed you to put this stuff in a function????????? */ 1171 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1172 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1173 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1174 upTriFactorT->csrMat->column_indices->data().get(), 1175 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1176 upTriFactorT->solveInfo, 1177 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1178 #else 1179 upTriFactorT->solveInfo)); 1180 #endif 1181 1182 PetscCallCUDA(WaitForCUDA()); 1183 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1184 1185 /* assign the pointer */ 1186 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1187 PetscFunctionReturn(0); 1188 } 1189 1190 struct PetscScalarToPetscInt 1191 { 1192 __host__ __device__ 1193 PetscInt operator()(PetscScalar s) 1194 { 1195 return (PetscInt)PetscRealPart(s); 1196 } 1197 }; 1198 1199 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1200 { 1201 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1202 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1203 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1204 cusparseStatus_t stat; 1205 cusparseIndexBase_t indexBase; 1206 1207 PetscFunctionBegin; 
1208 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1209 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1210 PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1211 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1212 PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1213 if (A->transupdated) PetscFunctionReturn(0); 1214 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1215 PetscCall(PetscLogGpuTimeBegin()); 1216 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1217 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1218 } 1219 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1220 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1221 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1222 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1223 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1224 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1225 1226 /* set alpha and beta */ 1227 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 1228 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 1229 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1230 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1231 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1232 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1233 1234 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1235 CsrMatrix *matrixT = new CsrMatrix; 1236 matstructT->mat = matrixT; 1237 matrixT->num_rows = A->cmap->n; 1238 matrixT->num_cols = A->rmap->n; 1239 matrixT->num_entries = a->nz; 
1240 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1241 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1242 matrixT->values = new THRUSTARRAY(a->nz); 1243 1244 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1245 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1246 1247 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1248 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1249 stat = cusparseCreateCsr(&matstructT->matDescr, 1250 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1251 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1252 matrixT->values->data().get(), 1253 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1254 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1255 #else 1256 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1257 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1258 1259 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1260 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1261 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1262 */ 1263 if (matrixT->num_entries) { 1264 stat = cusparseCreateCsr(&matstructT->matDescr, 1265 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1266 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1267 matrixT->values->data().get(), 1268 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1269 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1270 1271 } else { 1272 matstructT->matDescr = NULL; 1273 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1274 } 1275 #endif 1276 #endif 1277 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1278 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1279 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1280 #else 1281 CsrMatrix *temp = new CsrMatrix; 1282 CsrMatrix *tempT = new CsrMatrix; 1283 /* First convert HYB to CSR */ 1284 temp->num_rows = A->rmap->n; 1285 temp->num_cols = A->cmap->n; 1286 temp->num_entries = a->nz; 1287 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1288 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1289 temp->values = new THRUSTARRAY(a->nz); 1290 1291 stat = cusparse_hyb2csr(cusparsestruct->handle, 1292 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1293 temp->values->data().get(), 1294 temp->row_offsets->data().get(), 1295 temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1296 1297 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1298 tempT->num_rows = A->rmap->n; 1299 tempT->num_cols = A->cmap->n; 1300 tempT->num_entries = a->nz; 1301 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1302 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1303 tempT->values = new THRUSTARRAY(a->nz); 1304 1305 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1306 temp->num_cols, temp->num_entries, 1307 temp->values->data().get(), 1308 temp->row_offsets->data().get(), 1309 temp->column_indices->data().get(), 1310 tempT->values->data().get(), 1311 tempT->column_indices->data().get(), 1312 tempT->row_offsets->data().get(), 1313 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1314 1315 /* Last, convert CSC to HYB */ 1316 cusparseHybMat_t hybMat; 1317 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1318 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1319 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1320 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1321 matstructT->descr, tempT->values->data().get(), 1322 tempT->row_offsets->data().get(), 1323 tempT->column_indices->data().get(), 1324 hybMat, 0, partition);PetscCallCUSPARSE(stat); 1325 1326 /* assign the pointer */ 1327 matstructT->mat = hybMat; 1328 A->transupdated = PETSC_TRUE; 1329 /* delete temporaries */ 1330 if (tempT) { 1331 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1332 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1333 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1334 delete (CsrMatrix*) tempT; 1335 } 1336 if (temp) { 1337 if (temp->values) delete (THRUSTARRAY*) temp->values; 1338 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1339 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1340 delete (CsrMatrix*) temp; 1341 } 1342 #endif 1343 } 1344 } 1345 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */ 1346 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1347 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1348 PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1349 PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1350 PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1351 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1352 PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1353 PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1354 PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1355 PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1356 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1357 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1358 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1359 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1360 } 1361 if (!cusparsestruct->csr2csc_i) { 1362 THRUSTARRAY csr2csc_a(matrix->num_entries); 1363 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1364 1365 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1366 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1367 void *csr2cscBuffer; 1368 size_t csr2cscBufferSize; 1369 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1370 A->cmap->n, matrix->num_entries, 1371 matrix->values->data().get(), 1372 cusparsestruct->rowoffsets_gpu->data().get(), 1373 matrix->column_indices->data().get(), 1374 matrixT->values->data().get(), 1375 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1376 
CUSPARSE_ACTION_NUMERIC,indexBase, 1377 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 1378 PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1379 #endif 1380 1381 if (matrix->num_entries) { 1382 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1383 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1384 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1385 1386 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1387 should be filled with indexBase. So I just take a shortcut here. 1388 */ 1389 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1390 A->cmap->n,matrix->num_entries, 1391 csr2csc_a.data().get(), 1392 cusparsestruct->rowoffsets_gpu->data().get(), 1393 matrix->column_indices->data().get(), 1394 matrixT->values->data().get(), 1395 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1396 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1397 CUSPARSE_ACTION_NUMERIC,indexBase, 1398 cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1399 #else 1400 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1401 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1402 #endif 1403 } else { 1404 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1405 } 1406 1407 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1408 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1409 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1410 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1411 #endif 1412 } 1413 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1414 
thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the cached transposed triangular factors; builds them on first use.
   Applies the row permutation to b, solves U^T then L^T, then applies the column permutation. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for natural ordering: no row/column
   permutations are applied, so b feeds the U^T solve directly and x receives the L^T solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cuSPARSE triangular factors: permute b by the row permutation,
   solve L then U, then permute by the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b in natural ordering: no permutations, L solve from b into the work
   vector, then U solve into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* If the values live only on the GPU, copy them back into the host CSR array a->a
   and mark the matrix as valid on both sides. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Read/write access to the host values; syncs from the GPU first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* Host values may have been modified: mark the CPU copy as the valid one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access to the host values; syncs from the GPU first. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* Read-only restore: offload mask is unchanged since nothing was modified. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access: no sync from the GPU since the caller will overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  /* host values were (re)written: CPU copy is now the valid one */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Return device pointers to the CSR arrays (i, j, a) of the GPU copy, syncing to the
   GPU first. 64-bit PetscInt builds are rejected because the device arrays are 32-bit. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* Mirror the host CSR data onto the GPU. If the nonzero pattern is unchanged and the
   format is CSR only the values are re-uploaded; otherwise the device structures are
   rebuilt from scratch (CSR, or ELL/HYB before CUDA 11). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage uses only the nonempty rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no values yet: upload structure only */
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants; cuSPARSE is put in device pointer mode below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed once converted to HYB */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: second tuple element += first */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: second tuple element = first */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

struct
VecCUDAEqualsReverse 1992 { 1993 template <typename Tuple> 1994 __host__ __device__ 1995 void operator()(Tuple t) 1996 { 1997 thrust::get<0>(t) = thrust::get<1>(t); 1998 } 1999 }; 2000 2001 struct MatMatCusparse { 2002 PetscBool cisdense; 2003 PetscScalar *Bt; 2004 Mat X; 2005 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2006 PetscLogDouble flops; 2007 CsrMatrix *Bcsr; 2008 2009 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2010 cusparseSpMatDescr_t matSpBDescr; 2011 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2012 cusparseDnMatDescr_t matBDescr; 2013 cusparseDnMatDescr_t matCDescr; 2014 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2015 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2016 void *dBuffer4; 2017 void *dBuffer5; 2018 #endif 2019 size_t mmBufferSize; 2020 void *mmBuffer; 2021 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2022 cusparseSpGEMMDescr_t spgemmDesc; 2023 #endif 2024 }; 2025 2026 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2027 { 2028 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2029 2030 PetscFunctionBegin; 2031 PetscCallCUDA(cudaFree(mmdata->Bt)); 2032 delete mmdata->Bcsr; 2033 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2034 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2035 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2036 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2037 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2038 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2039 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2040 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2041 #endif 2042 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2043 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2044 
#endif 2045 PetscCall(MatDestroy(&mmdata->X)); 2046 PetscCall(PetscFree(data)); 2047 PetscFunctionReturn(0); 2048 } 2049 2050 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2051 2052 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2053 { 2054 Mat_Product *product = C->product; 2055 Mat A,B; 2056 PetscInt m,n,blda,clda; 2057 PetscBool flg,biscuda; 2058 Mat_SeqAIJCUSPARSE *cusp; 2059 cusparseStatus_t stat; 2060 cusparseOperation_t opA; 2061 const PetscScalar *barray; 2062 PetscScalar *carray; 2063 MatMatCusparse *mmdata; 2064 Mat_SeqAIJCUSPARSEMultStruct *mat; 2065 CsrMatrix *csrmat; 2066 2067 PetscFunctionBegin; 2068 MatCheckProduct(C,1); 2069 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2070 mmdata = (MatMatCusparse*)product->data; 2071 A = product->A; 2072 B = product->B; 2073 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2074 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2075 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2076 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2077 PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2078 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2079 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2080 switch (product->type) { 2081 case MATPRODUCT_AB: 2082 case MATPRODUCT_PtAP: 2083 mat = cusp->mat; 2084 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2085 m = A->rmap->n; 2086 n = B->cmap->n; 2087 break; 2088 case MATPRODUCT_AtB: 2089 if (!A->form_explicit_transpose) { 2090 mat = cusp->mat; 2091 opA = CUSPARSE_OPERATION_TRANSPOSE; 2092 } else { 2093 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2094 mat = cusp->matTranspose; 2095 opA = 
CUSPARSE_OPERATION_NON_TRANSPOSE; 2096 } 2097 m = A->cmap->n; 2098 n = B->cmap->n; 2099 break; 2100 case MATPRODUCT_ABt: 2101 case MATPRODUCT_RARt: 2102 mat = cusp->mat; 2103 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2104 m = A->rmap->n; 2105 n = B->rmap->n; 2106 break; 2107 default: 2108 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2109 } 2110 PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2111 csrmat = (CsrMatrix*)mat->mat; 2112 /* if the user passed a CPU matrix, copy the data to the GPU */ 2113 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 2114 if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 2115 PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2116 2117 PetscCall(MatDenseGetLDA(B,&blda)); 2118 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2119 PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 2120 PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2121 } else { 2122 PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 2123 PetscCall(MatDenseGetLDA(C,&clda)); 2124 } 2125 2126 PetscCall(PetscLogGpuTimeBegin()); 2127 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2128 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2129 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2130 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2131 size_t mmBufferSize; 2132 if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2133 if (!mmdata->matBDescr) { 2134 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2135 mmdata->Blda = blda; 2136 } 2137 2138 if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2139 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2140 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2141 mmdata->Clda = clda; 2142 } 2143 2144 if (!mat->matDescr) { 2145 stat = cusparseCreateCsr(&mat->matDescr, 2146 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2147 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2148 csrmat->values->data().get(), 2149 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2150 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2151 } 2152 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2153 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2154 mmdata->matCDescr,cusparse_scalartype, 2155 cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2156 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2157 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2158 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2159 mmdata->mmBufferSize = mmBufferSize; 2160 } 2161 mmdata->initialized = PETSC_TRUE; 2162 } else { 2163 /* to be safe, always update pointers of the mats 
*/ 2164 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 2165 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 2166 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2167 } 2168 2169 /* do cusparseSpMM, which supports transpose on B */ 2170 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2171 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2172 mmdata->matCDescr,cusparse_scalartype, 2173 cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2174 #else 2175 PetscInt k; 2176 /* cusparseXcsrmm does not support transpose on B */ 2177 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2178 cublasHandle_t cublasv2handle; 2179 cublasStatus_t cerr; 2180 2181 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2182 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2183 B->cmap->n,B->rmap->n, 2184 &PETSC_CUSPARSE_ONE ,barray,blda, 2185 &PETSC_CUSPARSE_ZERO,barray,blda, 2186 mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2187 blda = B->cmap->n; 2188 k = B->cmap->n; 2189 } else { 2190 k = B->rmap->n; 2191 } 2192 2193 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2194 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2195 csrmat->num_entries,mat->alpha_one,mat->descr, 2196 csrmat->values->data().get(), 2197 csrmat->row_offsets->data().get(), 2198 csrmat->column_indices->data().get(), 2199 mmdata->Bt ? 
                             mmdata->Bt : mmdata->Bt : barray,blda,mat->beta_zero,
                             carray,clda);PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  /* 2 flops per stored entry of A for each of the n columns of op(B) */
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    /* finish R*A*R^T with a dense-dense product of B and the intermediate X computed above
       (NOTE(review): boolean flags select the transposes applied by the Private helper — confirm there) */
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* convert results back in place when the caller's matrices were plain MATSEQDENSE (CPU) */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}

/*
   Symbolic phase of C = op(A)*op(B) with A of type SeqAIJCUSPARSE and B dense:
   validates the operand types/storage format, derives the result dimensions from
   the requested product type, makes C a MATSEQDENSECUDA matrix, and allocates the
   MatMatCusparse product data (a B^T scratch buffer on CUDA < 11.0, and a dense
   intermediate X for the two-step RARt/PtAP products). Installs the numeric phase
   callback; actual computation happens in MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on which product is requested */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode
/* Numeric phase of the sparse-sparse product C = op(A)*op(B) via cuSPARSE SpGEMM:
   recomputes the values of C on the GPU, reusing the pattern and the descriptors
   built by MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE. */
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, just finish assembly bookkeeping */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* exploit symmetry: the symbolic phase may have replaced AtB/ABt by AB when A/B is symmetric */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the plain or (explicitly formed) transposed mult struct for each operand */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* reuse path: pattern and buffers were set up by cusparseSpGEMMreuse_* in the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/*
   Symbolic phase of the sparse-sparse product C = op(A)*op(B) for two SeqAIJCUSPARSE
   matrices: validates operands, selects the operand mult structs (forming explicit
   transposes as needed), builds the CSR structure of C on the GPU with cuSPARSE
   SpGEMM (or the legacy csrgemm on CUDA < 11), and mirrors the pattern to the host.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2465 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2466 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2467 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2468 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2469 2470 ptype = product->type; 2471 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2472 ptype = MATPRODUCT_AB; 2473 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2474 } 2475 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2476 ptype = MATPRODUCT_AB; 2477 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2478 } 2479 biscompressed = PETSC_FALSE; 2480 ciscompressed = PETSC_FALSE; 2481 switch (ptype) { 2482 case MATPRODUCT_AB: 2483 m = A->rmap->n; 2484 n = B->cmap->n; 2485 k = A->cmap->n; 2486 Amat = Acusp->mat; 2487 Bmat = Bcusp->mat; 2488 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2489 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2490 break; 2491 case MATPRODUCT_AtB: 2492 m = A->cmap->n; 2493 n = B->cmap->n; 2494 k = A->rmap->n; 2495 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2496 Amat = Acusp->matTranspose; 2497 Bmat = Bcusp->mat; 2498 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2499 break; 2500 case MATPRODUCT_ABt: 2501 m = A->rmap->n; 2502 n = B->rmap->n; 2503 k = A->cmap->n; 2504 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2505 Amat = Acusp->mat; 2506 Bmat = Bcusp->matTranspose; 2507 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2508 break; 2509 default: 2510 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2511 } 2512 2513 /* create cusparse matrix */ 2514 PetscCall(MatSetSizes(C,m,n,m,n)); 2515 PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 2516 c = 
(Mat_SeqAIJ*)C->data; 2517 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2518 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2519 Ccsr = new CsrMatrix; 2520 2521 c->compressedrow.use = ciscompressed; 2522 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2523 c->compressedrow.nrows = a->compressedrow.nrows; 2524 PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 2525 PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 2526 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2527 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2528 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2529 } else { 2530 c->compressedrow.nrows = 0; 2531 c->compressedrow.i = NULL; 2532 c->compressedrow.rindex = NULL; 2533 Ccusp->workVector = NULL; 2534 Cmat->cprowIndices = NULL; 2535 } 2536 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2537 Ccusp->mat = Cmat; 2538 Ccusp->mat->mat = Ccsr; 2539 Ccsr->num_rows = Ccusp->nrows; 2540 Ccsr->num_cols = n; 2541 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2542 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2543 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2544 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2545 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 2546 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 2547 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2548 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2549 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2550 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2551 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2552 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2553 c->nz = 0; 2554 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2555 Ccsr->values = new THRUSTARRAY(c->nz); 2556 goto finalizesym; 2557 } 2558 2559 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2560 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2561 Acsr = (CsrMatrix*)Amat->mat; 2562 if (!biscompressed) { 2563 Bcsr = (CsrMatrix*)Bmat->mat; 2564 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2565 BmatSpDescr = Bmat->matDescr; 2566 #endif 2567 } else { /* we need to use row offsets for the full matrix */ 2568 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2569 Bcsr = new CsrMatrix; 2570 Bcsr->num_rows = B->rmap->n; 2571 Bcsr->num_cols = cBcsr->num_cols; 2572 Bcsr->num_entries = cBcsr->num_entries; 2573 Bcsr->column_indices = cBcsr->column_indices; 2574 Bcsr->values = cBcsr->values; 2575 if (!Bcusp->rowoffsets_gpu) { 2576 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2577 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2578 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 2579 } 2580 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2581 mmdata->Bcsr = Bcsr; 2582 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2583 if (Bcsr->num_rows && Bcsr->num_cols) { 2584 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2585 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2586 Bcsr->values->data().get(), 2587 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2588 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2589 } 2590 BmatSpDescr = mmdata->matSpBDescr; 2591 #endif 2592 } 2593 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2594 
PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2595 /* precompute flops count */ 2596 if (ptype == MATPRODUCT_AB) { 2597 for (i=0, flops = 0; i<A->rmap->n; i++) { 2598 const PetscInt st = a->i[i]; 2599 const PetscInt en = a->i[i+1]; 2600 for (j=st; j<en; j++) { 2601 const PetscInt brow = a->j[j]; 2602 flops += 2.*(b->i[brow+1] - b->i[brow]); 2603 } 2604 } 2605 } else if (ptype == MATPRODUCT_AtB) { 2606 for (i=0, flops = 0; i<A->rmap->n; i++) { 2607 const PetscInt anzi = a->i[i+1] - a->i[i]; 2608 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2609 flops += (2.*anzi)*bnzi; 2610 } 2611 } else { /* TODO */ 2612 flops = 0.; 2613 } 2614 2615 mmdata->flops = flops; 2616 PetscCall(PetscLogGpuTimeBegin()); 2617 2618 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2619 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2620 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2621 NULL, NULL, NULL, 2622 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2623 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2624 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2625 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2626 { 2627 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2628 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2629 */ 2630 void* dBuffer1 = NULL; 2631 void* dBuffer2 = NULL; 2632 void* dBuffer3 = NULL; 2633 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2634 size_t bufferSize1 = 0; 2635 size_t bufferSize2 = 0; 2636 size_t bufferSize3 = 0; 2637 size_t bufferSize4 = 0; 2638 size_t bufferSize5 = 0; 2639 2640 /*----------------------------------------------------------------------*/ 2641 /* ask bufferSize1 bytes for external memory */ 2642 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2643 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2644 &bufferSize1, NULL);PetscCallCUSPARSE(stat); 2645 PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 2646 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2647 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2648 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2649 &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 2650 2651 /*----------------------------------------------------------------------*/ 2652 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2653 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2654 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 2655 PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 2656 PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 2657 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 2658 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2659 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2660 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 2661 
PetscCallCUDA(cudaFree(dBuffer1)); 2662 PetscCallCUDA(cudaFree(dBuffer2)); 2663 2664 /*----------------------------------------------------------------------*/ 2665 /* get matrix C non-zero entries C_nnz1 */ 2666 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2667 c->nz = (PetscInt) C_nnz1; 2668 /* allocate matrix C */ 2669 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2670 Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2671 /* update matC with the new pointers */ 2672 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2673 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2674 2675 /*----------------------------------------------------------------------*/ 2676 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2677 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2678 &bufferSize5, NULL);PetscCallCUSPARSE(stat); 2679 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 2680 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2681 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2682 &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 2683 PetscCallCUDA(cudaFree(dBuffer3)); 2684 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2685 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2686 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2687 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2688 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 
2689 } 2690 #else 2691 size_t bufSize2; 2692 /* ask bufferSize bytes for external memory */ 2693 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2694 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2695 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2696 mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 2697 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 2698 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2699 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2700 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2701 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2702 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 2703 /* ask bufferSize again bytes for external memory */ 2704 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2705 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2706 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2707 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 2708 /* The CUSPARSE documentation is not clear, nor the API 2709 We need both buffers to perform the operations properly! 2710 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2711 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2712 is stored in the descriptor! What a messy API... 
*/ 2713 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 2714 /* compute the intermediate product of A * B */ 2715 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2716 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2717 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2718 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2719 /* get matrix C non-zero entries C_nnz1 */ 2720 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2721 c->nz = (PetscInt) C_nnz1; 2722 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 2723 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2724 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2725 Ccsr->values = new THRUSTARRAY(c->nz); 2726 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2727 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2728 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2729 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2730 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2731 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2732 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2733 #else 2734 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2735 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2736 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2737 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2738 Bmat->descr, Bcsr->num_entries, 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2739 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 2740 c->nz = cnz; 2741 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2742 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2743 Ccsr->values = new THRUSTARRAY(c->nz); 2744 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2745 2746 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2747 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2748 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2749 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2750 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2751 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2752 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2753 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2754 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2755 #endif 2756 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2757 PetscCall(PetscLogGpuTimeEnd()); 2758 finalizesym: 2759 c->singlemalloc = PETSC_FALSE; 2760 c->free_a = PETSC_TRUE; 2761 c->free_ij = PETSC_TRUE; 2762 PetscCall(PetscMalloc1(m+1,&c->i)); 2763 PetscCall(PetscMalloc1(c->nz,&c->j)); 2764 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2765 PetscInt *d_i = c->i; 2766 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2767 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2768 ii = *Ccsr->row_offsets; 2769 jj = *Ccsr->column_indices; 
2770 if (ciscompressed) d_i = c->compressedrow.i; 2771 PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2772 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2773 } else { 2774 PetscInt *d_i = c->i; 2775 if (ciscompressed) d_i = c->compressedrow.i; 2776 PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2777 PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2778 } 2779 if (ciscompressed) { /* need to expand host row offsets */ 2780 PetscInt r = 0; 2781 c->i[0] = 0; 2782 for (k = 0; k < c->compressedrow.nrows; k++) { 2783 const PetscInt next = c->compressedrow.rindex[k]; 2784 const PetscInt old = c->compressedrow.i[k]; 2785 for (; r < next; r++) c->i[r+1] = old; 2786 } 2787 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2788 } 2789 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 2790 PetscCall(PetscMalloc1(m,&c->ilen)); 2791 PetscCall(PetscMalloc1(m,&c->imax)); 2792 c->maxnz = c->nz; 2793 c->nonzerorowcnt = 0; 2794 c->rmax = 0; 2795 for (k = 0; k < m; k++) { 2796 const PetscInt nn = c->i[k+1] - c->i[k]; 2797 c->ilen[k] = c->imax[k] = nn; 2798 c->nonzerorowcnt += (PetscInt)!!nn; 2799 c->rmax = PetscMax(c->rmax,nn); 2800 } 2801 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 2802 PetscCall(PetscMalloc1(c->nz,&c->a)); 2803 Ccsr->num_entries = c->nz; 2804 2805 C->nonzerostate++; 2806 PetscCall(PetscLayoutSetUp(C->rmap)); 2807 PetscCall(PetscLayoutSetUp(C->cmap)); 2808 Ccusp->nonzerostate = C->nonzerostate; 2809 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2810 C->preallocated = PETSC_TRUE; 2811 C->assembled = PETSC_FALSE; 2812 C->was_assembled = PETSC_FALSE; 2813 if (product->api_user && A->offloadmask 
== PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
/* Select the symbolic-product implementation for a SeqAIJCUSPARSE matrix product.
   Reads the command-line options (per product type, and per API entry point when
   product->api_user is set) that let the user force the CPU backend, then installs
   the matching productsymbolic function pointer on mat. Falls back to the plain
   SeqAIJ implementation when either operand is bound to the CPU or is not cusparse. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool   isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* B may be a dense matrix; base-type compare so seqdensecuda also matches */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* The option name differs by API entry point (e.g. -matmatmult_backend_cpu for
       MatMatMult() callers vs the generic -mat_product_algorithm_backend_cpu). */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user forced the CPU path */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are composed from pairwise cusparse products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx; thin wrapper over the shared mult/add kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy; thin wrapper over the shared mult/add kernel (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A^H * xx (conjugate transpose) */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H * xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T * xx (plain transpose, no conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n): scatter-add the compressed work vector into the
   full-length result. One thread per entry; launched with ceil(n/256) blocks of 256.
   NOTE(review): the thread index is a 32-bit int while n is PetscInt — if PetscInt is
   64-bit and n could exceed 2^31 this would overflow; confirm n is bounded by the
   number of nonzero rows. idx/x could also be declared const. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* dense-vector lengths for the generic SpMV API */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: result is just y (or zero) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Hermitian, or no explicit transpose requested: let cusparse do op(A) on the
       original storage; otherwise build/reuse the explicitly stored transpose. */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows; /* op is a transpose, so x has length num_rows */
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation descriptor cache cuSpMV[], so guard against ABI drift */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                                  matstruct->matDescr,
                                                  matstruct->cuSpMV[opA].vecXDescr, beta,
                                                  matstruct->cuSpMV[opA].vecYDescr,
                                                  cusparse_scalartype,
                                                  cusparsestruct->spmvAlg,
                                                  &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                     matstruct->alpha_one,
                                     matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr,
                                     beta,
                                     matstruct->cuSpMV[opA].vecYDescr,
                                     cusparse_scalartype,
                                     cusparsestruct->spmvAlg,
                                     matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                          mat->num_rows, mat->num_cols,
                                          mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                          mat->values->data().get(), mat->row_offsets->data().get(),
                                          mat->column_indices->data().get(), xptr, beta,
                                          dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                            matstruct->alpha_one, matstruct->descr, hybMat,
                                            xptr, beta,
                                            dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without the add there is one fewer op per nonzero row */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T * xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Standard SeqAIJ assembly, then drop the cached device matrix if the nonzero
   pattern changed (it would no longer match the host structure). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (onnz != A->nonzerostate &&
cusp->deviceMat) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}

/* Release the GPU-side storage (plain or factored), unregister the composed
   methods installed by MatConvert_SeqAIJ_SeqAIJCUSPARSE, then destroy the host AIJ. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate as host AIJ, then convert the copy in place back to cusparse */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}

/* Y += a*X on the GPU. Uses cublas axpy when the nonzero patterns match,
   cusparse spgeam for subset patterns, and falls back to the host AIJ
   implementation otherwise (or when either matrix is bound to the CPU). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed bind-to-cpu state: do it on the host and invalidate Y's cached transpose */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format ==
MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nnz count: compare row offsets and column indices on the device */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* X's pattern is a subset of Y's: Y = a*X + 1.0*Y via cusparse spgeam, writing
       the result back into Y's existing structure */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t bufferSize;
    void   *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* a and b live on the host here; restore device pointer mode afterwards */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                     &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                     &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                     cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the value arrays line up, so a dense cublas axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}

/* Y *= a, scaling the device value array in place with cublas scal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz,&bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zero all stored values, on the device (mat and cached transpose, when present)
   and on the host; offloadmask records whether both copies are now in sync */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool  both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device copy zeroed too, so host and device agree */
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch A's operation table between the host (flg==PETSC_TRUE) and GPU
   implementations; factored matrices only record the flag */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make sure the host copy is current before handing control to CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale       = MatScale_SeqAIJ;
    A->ops->axpy        = MatAXPY_SeqAIJ;
    A->ops->zeroentries = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the cusparse-specific array accessors so the host defaults apply */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; only enable them when bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix (in place or into a new matrix, per reuse) to
   SeqAIJCUSPARSE: allocate the cusparse handle/state on first conversion,
   install the GPU operation table, and register the composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
 #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
 #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); /* installs the GPU op table */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ and convert in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
+  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE (and CUSPARSE band) solver types with PETSc's factor registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(0);
}

/* Release all COO-assembly state held on the device for this matrix */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* fix: NULL the freed pointers; MatSeqAIJCUSPARSE_Destroy() frees them again
       when non-NULL, which would be a double free after a Reset */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Tear down the whole Mat_SeqAIJCUSPARSE struct: both mat/matTranspose multiply
   structs, cached vectors/permutations, the cusparse handle and COO device maps */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix (thrust vectors for values, column indices, row offsets) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = NULL; /* fix: NULL instead of 0 for a pointer */
  }
  PetscFunctionReturn(0);
}

/* Free one triangular-factor struct (descriptor, analysis info, CSR data, buffers) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}

/* Free one multiply struct; the format decides whether mat holds a CsrMatrix or a HYB matrix */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* fix: check the error code like every other call here */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) { /* one cached SpMV setup per operation (N, T, H) */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Reset (but do not free) the triangular-factor container: destroy all four
   factor structs, permutations and work vector (continues below) */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose)); 3712 delete (*trifactors)->rpermIndices; 3713 delete (*trifactors)->cpermIndices; 3714 delete (*trifactors)->workVector; 3715 (*trifactors)->rpermIndices = NULL; 3716 (*trifactors)->cpermIndices = NULL; 3717 (*trifactors)->workVector = NULL; 3718 if ((*trifactors)->a_band_d) PetscCallCUDA(cudaFree((*trifactors)->a_band_d)); 3719 if ((*trifactors)->i_band_d) PetscCallCUDA(cudaFree((*trifactors)->i_band_d)); 3720 (*trifactors)->init_dev_prop = PETSC_FALSE; 3721 } 3722 PetscFunctionReturn(0); 3723 } 3724 3725 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3726 { 3727 cusparseHandle_t handle; 3728 3729 PetscFunctionBegin; 3730 if (*trifactors) { 3731 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 3732 if (handle = (*trifactors)->handle) { 3733 PetscCallCUSPARSE(cusparseDestroy(handle)); 3734 } 3735 PetscCall(PetscFree(*trifactors)); 3736 } 3737 PetscFunctionReturn(0); 3738 } 3739 3740 struct IJCompare 3741 { 3742 __host__ __device__ 3743 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3744 { 3745 if (t1.get<0>() < t2.get<0>()) return true; 3746 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3747 return false; 3748 } 3749 }; 3750 3751 struct IJEqual 3752 { 3753 __host__ __device__ 3754 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3755 { 3756 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3757 return true; 3758 } 3759 }; 3760 3761 struct IJDiff 3762 { 3763 __host__ __device__ 3764 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3765 { 3766 return t1 == t2 ? 
0 : 1;
  }
};

/* logical OR of two flags (transform functor combining the row/col change masks) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Insert/add the COO values v[] (host or device memory) into the device CSR
   values array, using the permutation (cooPerm) and, when entries repeat,
   the reduction map (cooPerm_a) built at preallocation time. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY *cooPerm_v = NULL;      /* device copy of v[] when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix *matrix;
  PetscInt n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done; just finish assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: INSERT means zero the matrix; ADD is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy is now the valid one */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose as stale; optionally destroy it (and the csr2csc map) */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt cooPerm_n, nzr = 0;    /* nzr counts nonzero rows */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* the COO size changed; rebuild the permutation arrays from scratch */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copies of the COO row/col indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                        ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR structure of the SeqAIJ base class from ii[] and d_j[] */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point: use the 'Basic' device path when indices are
   on device or contain no negatives; otherwise fall back to the host SeqAIJ
   extended-COO path and mirror its jmap/perm arrays onto the device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices (ignored entries) are only handled by the extended path */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ*>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}

/* Kernel: a[i] = (optionally a[i] +) sum of kv[perm[k]] over k in [jmap[i], jmap[i+1]),
   i.e. gather and reduce repeated COO entries into the i-th CSR nonzero.
   Grid-stride loop, so any launch configuration covers all nnz entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  PetscCount i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}

/* Insert/add COO values; dispatch to the extended-COO kernel or the 'Basic' thrust path */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount         Annz  = seq->nz;
  PetscMemType       memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write-only: no host-to-device copy */
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build (and cache) the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* no device work needed; just invalidate the caller's pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read access: device data must be current */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state change, no object-state bump */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.
A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: bring device data up to date first */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;                        /* caller may write: device becomes the valid copy */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* cached transpose values are now stale */
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values may have changed */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only: no MatSeqAIJCUSPARSECopyToGPU(); existing device values are ignored */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access
array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 4310 4311 Not Collective 4312 4313 Input Parameter: 4314 . A - a MATSEQAIJCUSPARSE matrix 4315 4316 Output Parameter: 4317 . a - pointer to the device data 4318 4319 Level: developer 4320 4321 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 4322 @*/ 4323 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4324 { 4325 PetscFunctionBegin; 4326 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4327 PetscValidPointer(a,2); 4328 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4329 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4330 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4331 *a = NULL; 4332 PetscFunctionReturn(0); 4333 } 4334 4335 struct IJCompare4 4336 { 4337 __host__ __device__ 4338 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4339 { 4340 if (t1.get<0>() < t2.get<0>()) return true; 4341 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4342 return false; 4343 } 4344 }; 4345 4346 struct Shift 4347 { 4348 int _shift; 4349 4350 Shift(int shift) : _shift(shift) {} 4351 __host__ __device__ 4352 inline int operator() (const int &c) 4353 { 4354 return c + _shift; 4355 } 4356 }; 4357 4358 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */ 4359 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4360 { 4361 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4362 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4363 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4364 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4365 PetscInt Annz,Bnnz; 4366 cusparseStatus_t stat; 4367 PetscInt i,m,n,zero = 0; 4368 4369 PetscFunctionBegin; 4370 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4371 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4372 PetscValidPointer(C,4); 4373 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4374 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4375 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 4376 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4377 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4378 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4379 if (reuse == MAT_INITIAL_MATRIX) { 4380 m = A->rmap->n; 4381 n = A->cmap->n + B->cmap->n; 4382 PetscCall(MatCreate(PETSC_COMM_SELF,C)); 4383 PetscCall(MatSetSizes(*C,m,n,m,n)); 4384 PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 4385 c = (Mat_SeqAIJ*)(*C)->data; 4386 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4387 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4388 Ccsr = new CsrMatrix; 4389 Cmat->cprowIndices = NULL; 4390 c->compressedrow.use = PETSC_FALSE; 4391 c->compressedrow.nrows = 0; 4392 c->compressedrow.i = NULL; 4393 c->compressedrow.rindex = NULL; 4394 Ccusp->workVector = NULL; 4395 Ccusp->nrows = m; 4396 Ccusp->mat = Cmat; 4397 Ccusp->mat->mat = Ccsr; 4398 Ccsr->num_rows = m; 4399 Ccsr->num_cols = n; 4400 
PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4401 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4402 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4403 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 4404 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 4405 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4406 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4407 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4408 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4409 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4410 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4411 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4412 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4413 4414 Acsr = (CsrMatrix*)Acusp->mat->mat; 4415 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4416 Annz = (PetscInt)Acsr->column_indices->size(); 4417 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4418 c->nz = Annz + Bnnz; 4419 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4420 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4421 Ccsr->values = new THRUSTARRAY(c->nz); 4422 Ccsr->num_entries = c->nz; 4423 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4424 if (c->nz) { 4425 auto Acoo = new THRUSTINTARRAY32(Annz); 4426 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4427 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4428 THRUSTINTARRAY32 *Aroff,*Broff; 4429 4430 if (a->compressedrow.use) { /* need full row offset */ 4431 if (!Acusp->rowoffsets_gpu) { 4432 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4433 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4434 
PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4435 } 4436 Aroff = Acusp->rowoffsets_gpu; 4437 } else Aroff = Acsr->row_offsets; 4438 if (b->compressedrow.use) { /* need full row offset */ 4439 if (!Bcusp->rowoffsets_gpu) { 4440 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4441 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4442 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 4443 } 4444 Broff = Bcusp->rowoffsets_gpu; 4445 } else Broff = Bcsr->row_offsets; 4446 PetscCall(PetscLogGpuTimeBegin()); 4447 stat = cusparseXcsr2coo(Acusp->handle, 4448 Aroff->data().get(), 4449 Annz, 4450 m, 4451 Acoo->data().get(), 4452 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4453 stat = cusparseXcsr2coo(Bcusp->handle, 4454 Broff->data().get(), 4455 Bnnz, 4456 m, 4457 Bcoo->data().get(), 4458 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4459 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4460 auto Aperm = thrust::make_constant_iterator(1); 4461 auto Bperm = thrust::make_constant_iterator(0); 4462 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4463 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4464 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4465 #else 4466 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4467 auto Bcib = Bcsr->column_indices->begin(); 4468 auto Bcie = Bcsr->column_indices->end(); 4469 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4470 #endif 4471 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4472 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4473 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4474 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4475 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4476 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4477 auto p1 = Ccusp->cooPerm->begin(); 4478 auto p2 = Ccusp->cooPerm->begin(); 4479 thrust::advance(p2,Annz); 4480 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4481 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4482 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4483 #endif 4484 auto cci = thrust::make_counting_iterator(zero); 4485 auto cce = thrust::make_counting_iterator(c->nz); 4486 #if 0 //Errors on SUMMIT cuda 11.1.0 4487 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4488 #else 4489 auto pred = thrust::identity<int>(); 4490 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4491 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4492 #endif 4493 stat = cusparseXcoo2csr(Ccusp->handle, 4494 Ccoo->data().get(), 4495 c->nz, 4496 m, 4497 Ccsr->row_offsets->data().get(), 4498 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4499 PetscCall(PetscLogGpuTimeEnd()); 4500 delete wPerm; 4501 delete Acoo; 4502 delete Bcoo; 4503 delete Ccoo; 4504 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4505 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4506 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4507 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4508 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4509 #endif 4510 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4511 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 4512 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4513 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4514 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4515 CsrMatrix *CcsrT = new CsrMatrix; 4516 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4517 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4518 4519 (*C)->form_explicit_transpose = PETSC_TRUE; 4520 (*C)->transupdated = PETSC_TRUE; 4521 Ccusp->rowoffsets_gpu = NULL; 4522 CmatT->cprowIndices = NULL; 4523 CmatT->mat = CcsrT; 4524 CcsrT->num_rows = n; 4525 CcsrT->num_cols = m; 4526 CcsrT->num_entries = c->nz; 4527 4528 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4529 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4530 CcsrT->values = new THRUSTARRAY(c->nz); 4531 4532 PetscCall(PetscLogGpuTimeBegin()); 4533 auto rT = CcsrT->row_offsets->begin(); 4534 if (AT) { 4535 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4536 thrust::advance(rT,-1); 4537 } 4538 if (BT) { 4539 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4540 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4541 thrust::copy(titb,tite,rT); 4542 } 4543 auto cT = CcsrT->column_indices->begin(); 4544 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4545 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4546 auto vT = CcsrT->values->begin(); 4547 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4548 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4549 PetscCall(PetscLogGpuTimeEnd()); 4550 4551 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 4552 PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, 
CUSPARSE_INDEX_BASE_ZERO)); 4553 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4554 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 4555 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 4556 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 4557 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4558 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4559 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4560 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4561 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4562 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4563 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4564 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4565 #endif 4566 Ccusp->matTranspose = CmatT; 4567 } 4568 } 4569 4570 c->singlemalloc = PETSC_FALSE; 4571 c->free_a = PETSC_TRUE; 4572 c->free_ij = PETSC_TRUE; 4573 PetscCall(PetscMalloc1(m+1,&c->i)); 4574 PetscCall(PetscMalloc1(c->nz,&c->j)); 4575 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4576 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4577 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4578 ii = *Ccsr->row_offsets; 4579 jj = *Ccsr->column_indices; 4580 PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4581 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4582 } else { 4583 PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4584 
PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4585 } 4586 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 4587 PetscCall(PetscMalloc1(m,&c->ilen)); 4588 PetscCall(PetscMalloc1(m,&c->imax)); 4589 c->maxnz = c->nz; 4590 c->nonzerorowcnt = 0; 4591 c->rmax = 0; 4592 for (i = 0; i < m; i++) { 4593 const PetscInt nn = c->i[i+1] - c->i[i]; 4594 c->ilen[i] = c->imax[i] = nn; 4595 c->nonzerorowcnt += (PetscInt)!!nn; 4596 c->rmax = PetscMax(c->rmax,nn); 4597 } 4598 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 4599 PetscCall(PetscMalloc1(c->nz,&c->a)); 4600 (*C)->nonzerostate++; 4601 PetscCall(PetscLayoutSetUp((*C)->rmap)); 4602 PetscCall(PetscLayoutSetUp((*C)->cmap)); 4603 Ccusp->nonzerostate = (*C)->nonzerostate; 4604 (*C)->preallocated = PETSC_TRUE; 4605 } else { 4606 PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4607 c = (Mat_SeqAIJ*)(*C)->data; 4608 if (c->nz) { 4609 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4610 PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4611 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4612 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4613 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4614 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4615 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4616 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4617 Acsr = (CsrMatrix*)Acusp->mat->mat; 4618 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4619 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4620 PetscCheck(Acsr->num_entries == 
(PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 4621 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4622 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4623 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4624 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4625 auto pmid = Ccusp->cooPerm->begin(); 4626 thrust::advance(pmid,Acsr->num_entries); 4627 PetscCall(PetscLogGpuTimeBegin()); 4628 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4629 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4630 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4631 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4632 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4633 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4634 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4635 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4636 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4637 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4638 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 4639 if (A->form_explicit_transpose && 
B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4640 PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4641 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4642 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4643 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4644 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4645 auto vT = CcsrT->values->begin(); 4646 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4647 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4648 (*C)->transupdated = PETSC_TRUE; 4649 } 4650 PetscCall(PetscLogGpuTimeEnd()); 4651 } 4652 } 4653 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4654 (*C)->assembled = PETSC_TRUE; 4655 (*C)->was_assembled = PETSC_FALSE; 4656 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4657 PetscFunctionReturn(0); 4658 } 4659 4660 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4661 { 4662 bool dmem; 4663 const PetscScalar *av; 4664 4665 PetscFunctionBegin; 4666 dmem = isCudaMem(v); 4667 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 4668 if (n && idx) { 4669 THRUSTINTARRAY widx(n); 4670 widx.assign(idx,idx+n); 4671 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 4672 4673 THRUSTARRAY *w = NULL; 4674 thrust::device_ptr<PetscScalar> dv; 4675 if (dmem) { 4676 dv = thrust::device_pointer_cast(v); 4677 } else { 4678 w = new THRUSTARRAY(n); 4679 dv = w->data(); 4680 } 4681 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4682 4683 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4684 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4685 
thrust::for_each(zibit,zieit,VecCUDAEquals()); 4686 if (w) { 4687 PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4688 } 4689 delete w; 4690 } else { 4691 PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4692 } 4693 if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4694 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4695 PetscFunctionReturn(0); 4696 } 4697