/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Name table for PetscOptionsEnum(): the storage-format values, then the enum type name,
   the option prefix, and a null sentinel */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* Algorithm name tables, listed in 0-based integer value order so that the index
   PetscOptionsEnum() returns matches the cuSPARSE enum value (checked at runtime in
   MatSetFromOptions_SeqAIJCUSPARSE) */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations for the static implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* NOTE: the next two declarations are C++ overloads of the same name (this is a .cu file,
   so overloading is legal); one destroys a triangular-factor struct, the other a mult struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the solver type */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Factory routine for MATSOLVERCUSPARSE factor matrices: creates a square SEQAIJCUSPARSE
   matrix B sized like A's row map, installs the CUSPARSE (or, when A is bound to the CPU,
   the plain SeqAIJ) symbolic factorization kernels for the requested factor type, and
   records the preferred orderings for each factorization */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* propagate CPU binding from A when binding propagation is requested */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested GPU
   storage format in the Mat_SeqAIJCUSPARSE struct. For SEQAIJCUSPARSE both supported
   operations (MULT and ALL) currently set the same single format field */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
   The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; no-op otherwise */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): records the flag
   controlling whether MatSolve runs on the CPU instead of the GPU */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Output Parameter:

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; no-op otherwise */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}

/* MatSetOption override: intercepts MAT_FORM_EXPLICIT_TRANSPOSE to keep the cached GPU
   transpose consistent; every other option is forwarded to the SeqAIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: pulls A back from the GPU, runs the built-in CPU numeric
   factorization, then installs the appropriate GPU (or CPU) MatSolve variants and copies
   the triangular factors to the GPU unless use_cpu_solve is set */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
  */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));
  if (row_identity && col_identity) {
    /* natural ordering: the permutation-free solve kernels can be used */
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}

/* Options processing: storage format, CPU-solve flag and (CUDA >= 11) the cuSPARSE
   SpMV/SpMM/csr2csc algorithm choices. Factored matrices (factortype != NONE) take no options */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any existing GPU triangular factors, delegates to the CPU SeqAIJ
   symbolic phase, and routes the numeric phase to the CUSPARSE variant */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above, delegating to the SeqAIJ LU symbolic phase */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: resets GPU factors, delegates to SeqAIJ, routes numeric Cholesky to CUSPARSE */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as ICC above */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Builds (or value-updates) the unit-diagonal lower triangular factor L in CSR form on the
   host from the CPU-factored SeqAIJ data, copies it to the GPU, and runs the cuSPARSE
   triangular-solve analysis on it. Skips all work when the data already lives on the GPU */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        /* first build: assemble the full CSR structure in pinned host memory */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix; row 0 holds only the implicit unit diagonal */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* append the unit diagonal entry after the strictly-lower part of row i */
          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                            loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                            loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                            loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            loTriFactor->solveInfo,
                                            loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                            loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer; the value buffer AALo is kept in AA_h for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v      += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds (or value-updates) the non-unit-diagonal upper triangular factor U in CSR form,
   storing the reciprocal of each diagonal entry, copies it to the GPU, and runs the
   cuSPARSE triangular-solve analysis. Rows are assembled back-to-front using a->diag */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        /* first build: assemble the full CSR structure in pinned host memory */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (stored as the reciprocal) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                                  &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                            upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                            upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                            upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            upTriFactor->solveInfo,
                                            upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                            upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer; keep the value buffer AAUp in AA_h for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else {
        /* structure unchanged: refresh the values only */
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Drives the GPU upload of both ILU triangular factors and caches the row/column
   permutation indices on the device when the orderings are not the identity */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
/* ---- tail of the ILU analysis/copy-to-GPU routine (its signature precedes this chunk) ---- */
  IS            isrow = a->row,iscol = a->icol;
  PetscBool     row_identity,col_identity;
  PetscInt      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector reused by the triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only when it is not the identity */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices: same treatment for the column permutation */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/*
   Builds the GPU triangular factors for an ICC factorization.

   Only the upper triangle U is stored (CSR arrays AiUp/AjUp).  The upper factor
   solves with U directly (NON_TRANSPOSE); the lower factor reuses the same
   sparsity pattern with rescaled values (AALo) and solves with the TRANSPOSE
   operation, i.e. L = U^T.  On a repeat call with existing factor structs,
   only the numerical values are refreshed.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is re-cast as Mat_SeqSBAIJ to read the factor arrays i/j/a — presumably
     the ICC factor is stored in SBAIJ layout here; confirm against the factorization routine */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the factor values */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix: for each row, the (inverted) diagonal
           entry is placed first, followed by the negated strict-upper entries */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz]; /* lower-factor values are the upper ones scaled by 1/diag */
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                                  &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                            upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                            upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                            upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            upTriFactor->solveInfo,
                                            upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                            upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* fill mode stays UPPER: the lower solve is done as U^T via the TRANSPOSE op below */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: shares the upper factor's CSR pattern, values from AALo */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                            loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                            loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                            loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            loTriFactor->solveInfo,
                                            loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                            loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Factor structs already exist: only recompute and re-upload the values.
           Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds the ICC triangular factors on the GPU and uploads the row permutation
   and its inverse (as cperm) when the ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles are stored, so count the off-diagonal entries twice */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/*
   Numeric Cholesky factorization: runs the CPU SeqAIJ factorization, then
   selects the GPU MatSolve variants (natural-ordering ones when the permutation
   is the identity) and copies the triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b  = (Mat_SeqAIJ*)B->data;
  IS         ip  = b->row;
  PetscBool  perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/*
   Builds explicit transposes (CSC forms) of both triangular factors and runs the
   cuSPARSE solve analysis on them, so MatSolveTranspose can use NON_TRANSPOSE solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
cusparseDiagType_t diagType; 997 998 PetscFunctionBegin; 999 /* allocate space for the transpose of the lower triangular factor */ 1000 PetscCall(PetscNew(&loTriFactorT)); 1001 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1002 1003 /* set the matrix descriptors of the lower triangular factor */ 1004 matrixType = cusparseGetMatType(loTriFactor->descr); 1005 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1006 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1007 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1008 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1009 1010 /* Create the matrix description */ 1011 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1012 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1013 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1014 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1015 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1016 1017 /* set the operation */ 1018 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1019 1020 /* allocate GPU space for the CSC of the lower triangular factor*/ 1021 loTriFactorT->csrMat = new CsrMatrix; 1022 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1023 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1024 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1025 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1026 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1027 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1028 1029 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1030 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1031 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1032 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1033 loTriFactor->csrMat->values->data().get(), 1034 loTriFactor->csrMat->row_offsets->data().get(), 1035 loTriFactor->csrMat->column_indices->data().get(), 1036 loTriFactorT->csrMat->values->data().get(), 1037 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1038 CUSPARSE_ACTION_NUMERIC,indexBase, 1039 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1040 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 1041 #endif 1042 1043 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1044 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1045 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1046 loTriFactor->csrMat->values->data().get(), 1047 loTriFactor->csrMat->row_offsets->data().get(), 1048 loTriFactor->csrMat->column_indices->data().get(), 1049 loTriFactorT->csrMat->values->data().get(), 1050 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1051 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1052 CUSPARSE_ACTION_NUMERIC, indexBase, 1053 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 1054 #else 1055 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1056 CUSPARSE_ACTION_NUMERIC, indexBase)); 1057 #endif 1058 PetscCallCUDA(WaitForCUDA()); 1059 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1060 1061 /* Create the solve analysis information */ 1062 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1063 PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo)); 1064 #if 
PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1065 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1066 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1067 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1068 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1069 &loTriFactorT->solveBufferSize)); 1070 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 1071 #endif 1072 1073 /* perform the solve analysis */ 1074 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1075 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1076 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1077 loTriFactorT->csrMat->column_indices->data().get(), 1078 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1079 loTriFactorT->solveInfo, 1080 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1081 #else 1082 loTriFactorT->solveInfo)); 1083 #endif 1084 PetscCallCUDA(WaitForCUDA()); 1085 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1086 1087 /* assign the pointer */ 1088 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1089 1090 /*********************************************/ 1091 /* Now the Transpose of the Upper Tri Factor */ 1092 /*********************************************/ 1093 1094 /* allocate space for the transpose of the upper triangular factor */ 1095 PetscCall(PetscNew(&upTriFactorT)); 1096 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1097 1098 /* set the matrix descriptors of the upper triangular factor */ 1099 matrixType = cusparseGetMatType(upTriFactor->descr); 1100 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1101 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1102 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1103 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1104 1105 /* Create the matrix description */ 1106 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1107 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1108 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1109 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1110 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1111 1112 /* set the operation */ 1113 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1114 1115 /* allocate GPU space for the CSC of the upper triangular factor*/ 1116 upTriFactorT->csrMat = new CsrMatrix; 1117 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1118 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1119 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1120 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1121 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1122 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1123 1124 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1125 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1126 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1127 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1128 upTriFactor->csrMat->values->data().get(), 1129 upTriFactor->csrMat->row_offsets->data().get(), 1130 upTriFactor->csrMat->column_indices->data().get(), 1131 upTriFactorT->csrMat->values->data().get(), 1132 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1133 CUSPARSE_ACTION_NUMERIC,indexBase, 1134 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1135 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1136 #endif 1137 1138 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1139 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1140 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1141 upTriFactor->csrMat->values->data().get(), 1142 upTriFactor->csrMat->row_offsets->data().get(), 1143 upTriFactor->csrMat->column_indices->data().get(), 1144 upTriFactorT->csrMat->values->data().get(), 1145 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1146 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1147 CUSPARSE_ACTION_NUMERIC, indexBase, 1148 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1149 #else 1150 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1151 CUSPARSE_ACTION_NUMERIC, indexBase)); 1152 #endif 1153 1154 PetscCallCUDA(WaitForCUDA()); 1155 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1156 1157 /* Create the solve analysis information */ 1158 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1159 PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo)); 1160 
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                            &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis on the transposed upper factor */
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                      upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                      upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                      upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                      upTriFactorT->solveInfo,
                                      upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                      upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* unary functor mapping a PetscScalar to a PetscInt by truncating its real part;
   used to turn values produced by thrust::sequence back into integer indices */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/*
   Forms (or refreshes) the explicit transpose of the matrix on the GPU, stored in
   cusparsestruct->matTranspose.  A no-op when A->transupdated is already set.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-side scalar constants used by SpMV/SpMM calls */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      /* the transpose has swapped row/column dimensions */
      matrixT->num_rows    = A->cmap->n;
      matrixT->num_cols    = A->rmap->n;
      matrixT->num_entries = a->nz;
1240 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1241 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1242 matrixT->values = new THRUSTARRAY(a->nz); 1243 1244 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1245 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1246 1247 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1248 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1249 stat = cusparseCreateCsr(&matstructT->matDescr, 1250 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1251 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1252 matrixT->values->data().get(), 1253 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1254 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1255 #else 1256 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1257 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1258 1259 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1260 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1261 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1262 */ 1263 if (matrixT->num_entries) { 1264 stat = cusparseCreateCsr(&matstructT->matDescr, 1265 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1266 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1267 matrixT->values->data().get(), 1268 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1269 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1270 1271 } else { 1272 matstructT->matDescr = NULL; 1273 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1274 } 1275 #endif 1276 #endif 1277 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1278 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1279 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1280 #else 1281 CsrMatrix *temp = new CsrMatrix; 1282 CsrMatrix *tempT = new CsrMatrix; 1283 /* First convert HYB to CSR */ 1284 temp->num_rows = A->rmap->n; 1285 temp->num_cols = A->cmap->n; 1286 temp->num_entries = a->nz; 1287 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1288 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1289 temp->values = new THRUSTARRAY(a->nz); 1290 1291 stat = cusparse_hyb2csr(cusparsestruct->handle, 1292 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1293 temp->values->data().get(), 1294 temp->row_offsets->data().get(), 1295 temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1296 1297 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1298 tempT->num_rows = A->rmap->n; 1299 tempT->num_cols = A->cmap->n; 1300 tempT->num_entries = a->nz; 1301 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1302 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1303 tempT->values = new THRUSTARRAY(a->nz); 1304 1305 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1306 temp->num_cols, temp->num_entries, 1307 temp->values->data().get(), 1308 temp->row_offsets->data().get(), 1309 temp->column_indices->data().get(), 1310 tempT->values->data().get(), 1311 tempT->column_indices->data().get(), 1312 tempT->row_offsets->data().get(), 1313 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1314 1315 /* Last, convert CSC to HYB */ 1316 cusparseHybMat_t hybMat; 1317 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1318 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1319 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1320 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1321 matstructT->descr, tempT->values->data().get(), 1322 tempT->row_offsets->data().get(), 1323 tempT->column_indices->data().get(), 1324 hybMat, 0, partition);PetscCallCUSPARSE(stat); 1325 1326 /* assign the pointer */ 1327 matstructT->mat = hybMat; 1328 A->transupdated = PETSC_TRUE; 1329 /* delete temporaries */ 1330 if (tempT) { 1331 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1332 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1333 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1334 delete (CsrMatrix*) tempT; 1335 } 1336 if (temp) { 1337 if (temp->values) delete (THRUSTARRAY*) temp->values; 1338 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1339 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1340 delete (CsrMatrix*) temp; 1341 } 1342 #endif 1343 } 1344 } 1345 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */ 1346 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1347 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1348 PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1349 PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1350 PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1351 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1352 PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1353 PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1354 PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1355 PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1356 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1357 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1358 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1359 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1360 } 1361 if (!cusparsestruct->csr2csc_i) { 1362 THRUSTARRAY csr2csc_a(matrix->num_entries); 1363 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1364 1365 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1366 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1367 void *csr2cscBuffer; 1368 size_t csr2cscBufferSize; 1369 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1370 A->cmap->n, matrix->num_entries, 1371 matrix->values->data().get(), 1372 cusparsestruct->rowoffsets_gpu->data().get(), 1373 matrix->column_indices->data().get(), 1374 matrixT->values->data().get(), 1375 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1376 
CUSPARSE_ACTION_NUMERIC,indexBase, 1377 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 1378 PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1379 #endif 1380 1381 if (matrix->num_entries) { 1382 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1383 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1384 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1385 1386 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1387 should be filled with indexBase. So I just take a shortcut here. 1388 */ 1389 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1390 A->cmap->n,matrix->num_entries, 1391 csr2csc_a.data().get(), 1392 cusparsestruct->rowoffsets_gpu->data().get(), 1393 matrix->column_indices->data().get(), 1394 matrixT->values->data().get(), 1395 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1396 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1397 CUSPARSE_ACTION_NUMERIC,indexBase, 1398 cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1399 #else 1400 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1401 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1402 #endif 1403 } else { 1404 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1405 } 1406 1407 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1408 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1409 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1410 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1411 #endif 1412 } 1413 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1414 
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Triangular solve of A^T x = b using factor structures that were analyzed for the
   transpose (loTriFactorPtrTranspose/upTriFactorPtrTranspose, built lazily below by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve).  Sequence: apply the row permutation to b,
   solve with the transposed upper factor, then the transposed lower factor, and finally
   apply the column permutation. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: xGPU <- b[rperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (input xarray, result in tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed lower factor (input tempGPU, result in xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for natural ordering: no row/column
   permutations are applied, so b feeds the first solve directly and the second solve
   writes straight into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the transposed upper factor (input barray, result in tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed lower factor (input tempGPU, result in xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Triangular solve of A x = b with the (non-transposed) factors: apply the row
   permutation to b, solve L then U, then apply the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU <- b[rperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (input tempGPU, result in xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U (input xarray, result in tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation: x <- tempGPU[cperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Same as MatSolve_SeqAIJCUSPARSE but for natural ordering: no permutations,
   so b feeds the L solve directly and the U solve writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L (input barray, result in tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U (input tempGPU, result in xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Copy the matrix values (only the values; the nonzero pattern is shared) from the
   GPU CSR storage back to the host Mat_SeqAIJ arrays when the GPU copy is ahead. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;   /* host and device now agree */
  }
  PetscFunctionReturn(0);
}

/* Read/write access to the host value array; syncs device->host first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* Values may have been changed on the host, so mark the device copy stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access to the host value array; syncs device->host first. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* Read-only access cannot change values, so the offload mask is untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access: current values will be overwritten, so no device->host copy. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  /* host values were (re)written, so the device copy is now stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Return device pointers to the CSR arrays (i, j, a) of the GPU copy of A, together
   with the memory type.  Only 32-bit index builds are supported since the device
   arrays are THRUSTINTARRAY32. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* Copy (or build) the cuSPARSE representation of A on the GPU.  If the nonzero
   pattern is unchanged and the format is CSR only the values are copied; otherwise
   the old device structures are destroyed and rebuilt from the host CSR (or, for
   pre-CUDA-11, optionally converted to ELL/HYB storage). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* values changed: the cached transpose values are stale (pattern is kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed rows only the nonempty rows are represented on the device */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values yet: device-only after this */
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalars for alpha/beta, matching CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: build a temporary CSR and convert it to HYB storage */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: second tuple element += first (y += x). */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: second tuple element = first (y = x). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

struct
VecCUDAEqualsReverse
{
  /* Thrust functor: first tuple element = second (assignment in the reverse direction
     of VecCUDAEquals). */
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Scratch data attached to a Mat_Product for sparse (AIJCUSPARSE) times dense
   (DENSECUDA) products; holds intermediate buffers and cuSPARSE descriptors that
   are reused between the symbolic and numeric phases. */
struct MatMatCusparse {
  PetscBool      cisdense;   /* whether C must be converted back to plain SEQDENSE afterwards */
  PetscScalar    *Bt;        /* explicit transpose of B (pre-CUDA-11 path only) */
  Mat            X;          /* intermediate product for PtAP/RARt */
  PetscBool      reusesym;   /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda;   /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                 mmBufferSize;
  void                   *mmBuffer;
  void                   *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t  spgemmDesc;
#endif
};

/* Destructor for the MatMatCusparse product data: releases every device buffer and
   cuSPARSE descriptor that was allocated, then the intermediate matrix X and the
   struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A)*op(B) for A sparse (SEQAIJCUSPARSE) and B dense
   (SEQDENSECUDA).  Supports AB, AtB, ABt, PtAP and RARt; for PtAP/RARt the sparse
   product lands in mmdata->X and a dense-dense multiply finishes C. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cuSPARSE transpose on the fly, or use the explicit transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA =
CUSPARSE_OPERATION_NON_TRANSPOSE; 2096 } 2097 m = A->cmap->n; 2098 n = B->cmap->n; 2099 break; 2100 case MATPRODUCT_ABt: 2101 case MATPRODUCT_RARt: 2102 mat = cusp->mat; 2103 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2104 m = A->rmap->n; 2105 n = B->rmap->n; 2106 break; 2107 default: 2108 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2109 } 2110 PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2111 csrmat = (CsrMatrix*)mat->mat; 2112 /* if the user passed a CPU matrix, copy the data to the GPU */ 2113 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 2114 if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 2115 PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2116 2117 PetscCall(MatDenseGetLDA(B,&blda)); 2118 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2119 PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 2120 PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2121 } else { 2122 PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 2123 PetscCall(MatDenseGetLDA(C,&clda)); 2124 } 2125 2126 PetscCall(PetscLogGpuTimeBegin()); 2127 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2128 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2129 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2130 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2131 size_t mmBufferSize; 2132 if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2133 if (!mmdata->matBDescr) { 2134 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2135 mmdata->Blda = blda; 2136 } 2137 2138 if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2139 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2140 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2141 mmdata->Clda = clda; 2142 } 2143 2144 if (!mat->matDescr) { 2145 stat = cusparseCreateCsr(&mat->matDescr, 2146 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2147 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2148 csrmat->values->data().get(), 2149 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2150 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2151 } 2152 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2153 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2154 mmdata->matCDescr,cusparse_scalartype, 2155 cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2156 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2157 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2158 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2159 mmdata->mmBufferSize = mmBufferSize; 2160 } 2161 mmdata->initialized = PETSC_TRUE; 2162 } else { 2163 /* to be safe, always update pointers of the mats 
*/ 2164 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 2165 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 2166 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2167 } 2168 2169 /* do cusparseSpMM, which supports transpose on B */ 2170 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2171 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2172 mmdata->matCDescr,cusparse_scalartype, 2173 cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2174 #else 2175 PetscInt k; 2176 /* cusparseXcsrmm does not support transpose on B */ 2177 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2178 cublasHandle_t cublasv2handle; 2179 cublasStatus_t cerr; 2180 2181 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2182 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2183 B->cmap->n,B->rmap->n, 2184 &PETSC_CUSPARSE_ONE ,barray,blda, 2185 &PETSC_CUSPARSE_ZERO,barray,blda, 2186 mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2187 blda = B->cmap->n; 2188 k = B->cmap->n; 2189 } else { 2190 k = B->rmap->n; 2191 } 2192 2193 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2194 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2195 csrmat->num_entries,mat->alpha_one,mat->descr, 2196 csrmat->values->data().get(), 2197 csrmat->row_offsets->data().get(), 2198 csrmat->column_indices->data().get(), 2199 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2200 carray,clda);PetscCallCUSPARSE(stat); 2201 #endif 2202 PetscCall(PetscLogGpuTimeEnd()); 2203 PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries)); 2204 PetscCall(MatDenseCUDARestoreArrayRead(B,&barray)); 2205 if (product->type == MATPRODUCT_RARt) { 2206 PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 2207 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE)); 2208 } else if (product->type == MATPRODUCT_PtAP) { 2209 PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 2210 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE)); 2211 } else { 2212 PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray)); 2213 } 2214 if (mmdata->cisdense) { 2215 PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C)); 2216 } 2217 if (!biscuda) { 2218 PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B)); 2219 } 2220 PetscFunctionReturn(0); 2221 } 2222
/* Symbolic phase for products C = op(A)*op(B) where A is MATSEQAIJCUSPARSE (CSR format only) and B is dense.
   Sets the sizes and (CUDA) type of C, and allocates the MatMatCusparse product data:
   - for CUDA < 11.0 and ABt/RARt, a device buffer Bt to hold B^T (cusparseXcsrmm cannot transpose B);
   - for RARt/PtAP, an intermediate dense CUDA matrix X.
   The numeric phase is installed in C->ops->productnumeric at the end. */
2223 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2224 { 2225 Mat_Product *product = C->product; 2226 Mat A,B; 2227 PetscInt m,n; 2228 PetscBool cisdense,flg; 2229 MatMatCusparse *mmdata; 2230 Mat_SeqAIJCUSPARSE *cusp; 2231 2232 PetscFunctionBegin; 2233 MatCheckProduct(C,1); 2234 PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2235 A = product->A; 2236 B = product->B; 2237 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2238 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2239 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2240 PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
/* determine the dimensions (m x n) of the result from the product type */
2241 switch (product->type) { 2242 case MATPRODUCT_AB: 2243 m = A->rmap->n; 2244 n = B->cmap->n; 2245 break; 2246 case MATPRODUCT_AtB: 2247 m = A->cmap->n; 2248 n = 
B->cmap->n; 2249 break; 2250 case MATPRODUCT_ABt: 2251 m = A->rmap->n; 2252 n = B->rmap->n; 2253 break; 2254 case MATPRODUCT_PtAP: 2255 m = B->cmap->n; 2256 n = B->cmap->n; 2257 break; 2258 case MATPRODUCT_RARt: 2259 m = B->rmap->n; 2260 n = B->rmap->n; 2261 break; 2262 default: 2263 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2264 } 2265 PetscCall(MatSetSizes(C,m,n,m,n)); 2266 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2267 PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense)); 2268 PetscCall(MatSetType(C,MATSEQDENSECUDA)); 2269 2270 /* product data */ 2271 PetscCall(PetscNew(&mmdata)); 2272 mmdata->cisdense = cisdense; 2273 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2274 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2275 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2276 PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar))); 2277 } 2278 #endif 2279 /* for these products we need intermediate storage */ 2280 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2281 PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X)); 2282 PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA)); 2283 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2284 PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n)); 2285 } else { 2286 PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n)); 2287 } 2288 } 2289 C->product->data = mmdata; 2290 C->product->destroy = MatDestroy_MatMatCusparse;
/* the numeric phase does the actual cuSPARSE SpMM/csrmm work */
2291 2292 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2293 PetscFunctionReturn(0); 2294 } 2295 2296 static PetscErrorCode 
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2297 {
/* Numeric phase of sparse-sparse products (AB, AtB, ABt) for MATSEQAIJCUSPARSE operands.
   Uses the descriptors, buffers and (for CUDA >= 11) the SpGEMM descriptor prepared by the
   symbolic phase; for transposed products the explicit transpose mult structs are used,
   since cuSPARSE spgemm itself does not support transpose (see opA/opB below).
   If mmdata->reusesym is set, the values were already computed during the symbolic phase
   (api_user case) and only the assembly bookkeeping at `finalize` is executed. */
2298 Mat_Product *product = C->product; 2299 Mat A,B; 2300 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2301 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2302 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2303 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2304 PetscBool flg; 2305 cusparseStatus_t stat; 2306 MatProductType ptype; 2307 MatMatCusparse *mmdata; 2308 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2309 cusparseSpMatDescr_t BmatSpDescr; 2310 #endif 2311 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2312 2313 PetscFunctionBegin; 2314 MatCheckProduct(C,1); 2315 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2316 PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg)); 2317 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2318 mmdata = (MatMatCusparse*)C->product->data; 2319 A = product->A; 2320 B = product->B; 2321 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2322 mmdata->reusesym = PETSC_FALSE; 2323 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2324 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2325 Cmat = Ccusp->mat; 2326 PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2327 Ccsr = (CsrMatrix*)Cmat->mat; 2328 PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2329 goto finalize; 2330 } 2331 if (!c->nz) goto finalize; 2332 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2333 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type 
%s",((PetscObject)A)->type_name); 2334 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 2335 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2336 PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2337 PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2338 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2339 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2340 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2341 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2342 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2343 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2344 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2345 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2346
/* if the symbolic phase exploited the symmetry of A (or B), the numeric phase must do the
   same: rewrite AtB (ABt) as AB so that the non-transposed mult structs are used below */
2347 ptype = product->type; 2348 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2349 ptype = MATPRODUCT_AB; 2350 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2351 } 2352 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2353 ptype = MATPRODUCT_AB; 2354 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2355 }
/* select the (possibly explicitly transposed) mult structs built by the symbolic phase */
2356 switch (ptype) { 2357 case MATPRODUCT_AB: 2358 Amat = Acusp->mat; 2359 Bmat = Bcusp->mat; 2360 break; 2361 case MATPRODUCT_AtB: 2362 Amat = Acusp->matTranspose; 2363 Bmat = Bcusp->mat; 2364 break; 2365 case 
MATPRODUCT_ABt: 2366 Amat = Acusp->mat; 2367 Bmat = Bcusp->matTranspose; 2368 break; 2369 default: 2370 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2371 } 2372 Cmat = Ccusp->mat; 2373 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2374 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2375 PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2376 Acsr = (CsrMatrix*)Amat->mat; 2377 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2378 Ccsr = (CsrMatrix*)Cmat->mat; 2379 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2380 PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2381 PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2382 PetscCall(PetscLogGpuTimeBegin()); 2383 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2384 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2385 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
/* CUDA >= 11.4: values-only recomputation; the sparsity analysis was done with
   cusparseSpGEMMreuse_* in the symbolic phase and is reused here */
2386 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2387 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2388 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2389 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2390 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2391 #else 2392 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2393 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2394 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2395 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2396 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2397 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2398 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2399 #endif 2400 #else 2401 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2402 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2403 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2404 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2405 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2406 #endif 2407 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2408 PetscCallCUDA(WaitForCUDA()); 2409 PetscCall(PetscLogGpuTimeEnd()); 2410 C->offloadmask = PETSC_OFFLOAD_GPU; 2411 finalize: 2412 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2413 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 2414 PetscCall(PetscInfo(C,"Number of mallocs during 
MatSetValues() is 0\n")); 2415 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 2416 c->reallocs = 0; 2417 C->info.mallocs += 0; 2418 C->info.nz_unneeded = 0; 2419 C->assembled = C->was_assembled = PETSC_TRUE; 2420 C->num_ass++; 2421 PetscFunctionReturn(0); 2422 } 2423
/* Symbolic phase of sparse-sparse products (AB, AtB, ABt) for MATSEQAIJCUSPARSE operands.
   Computes the sparsity pattern of C on the GPU (cusparseSpGEMMreuse for CUDA >= 11.4,
   cusparseSpGEMM for CUDA 11.x, cusparseXcsrgemm otherwise; the pre-11 interface cannot do
   symbolic-only, so values are computed too), allocates the MatMatCusparse product data and
   the CSR storage of C, and copies the pattern back to the host Mat_SeqAIJ structure.
   Explicit transposes are formed for AtB (of A) and ABt (of B) since spgemm does not
   support transposed operands. */
2424 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2425 { 2426 Mat_Product *product = C->product; 2427 Mat A,B; 2428 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2429 Mat_SeqAIJ *a,*b,*c; 2430 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2431 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2432 PetscInt i,j,m,n,k; 2433 PetscBool flg; 2434 cusparseStatus_t stat; 2435 MatProductType ptype; 2436 MatMatCusparse *mmdata; 2437 PetscLogDouble flops; 2438 PetscBool biscompressed,ciscompressed; 2439 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2440 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2441 cusparseSpMatDescr_t BmatSpDescr; 2442 #else 2443 int cnz; 2444 #endif 2445 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2446 2447 PetscFunctionBegin; 2448 MatCheckProduct(C,1); 2449 PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2450 A = product->A; 2451 B = product->B; 2452 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2453 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2454 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 2455 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2456 a = (Mat_SeqAIJ*)A->data; 2457 b = (Mat_SeqAIJ*)B->data; 2458 /* product data */ 2459 PetscCall(PetscNew(&mmdata)); 2460 C->product->data = mmdata; 2461 C->product->destroy = MatDestroy_MatMatCusparse; 2462 2463 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2464 
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2465 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2466 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2467 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2468 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2469 2470 ptype = product->type; 2471 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2472 ptype = MATPRODUCT_AB; 2473 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2474 } 2475 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2476 ptype = MATPRODUCT_AB; 2477 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2478 } 2479 biscompressed = PETSC_FALSE; 2480 ciscompressed = PETSC_FALSE; 2481 switch (ptype) { 2482 case MATPRODUCT_AB: 2483 m = A->rmap->n; 2484 n = B->cmap->n; 2485 k = A->cmap->n; 2486 Amat = Acusp->mat; 2487 Bmat = Bcusp->mat; 2488 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2489 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2490 break; 2491 case MATPRODUCT_AtB: 2492 m = A->cmap->n; 2493 n = B->cmap->n; 2494 k = A->rmap->n; 2495 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2496 Amat = Acusp->matTranspose; 2497 Bmat = Bcusp->mat; 2498 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2499 break; 2500 case MATPRODUCT_ABt: 2501 m = A->rmap->n; 2502 n = B->rmap->n; 2503 k = A->cmap->n; 2504 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2505 Amat = Acusp->mat; 2506 Bmat = Bcusp->matTranspose; 2507 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2508 break; 2509 default: 2510 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2511 } 2512 2513 /* create cusparse matrix */ 2514 PetscCall(MatSetSizes(C,m,n,m,n)); 2515 PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 2516 c = 
(Mat_SeqAIJ*)C->data; 2517 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2518 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2519 Ccsr = new CsrMatrix; 2520 2521 c->compressedrow.use = ciscompressed; 2522 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2523 c->compressedrow.nrows = a->compressedrow.nrows; 2524 PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 2525 PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 2526 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2527 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2528 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2529 } else { 2530 c->compressedrow.nrows = 0; 2531 c->compressedrow.i = NULL; 2532 c->compressedrow.rindex = NULL; 2533 Ccusp->workVector = NULL; 2534 Cmat->cprowIndices = NULL; 2535 } 2536 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2537 Ccusp->mat = Cmat; 2538 Ccusp->mat->mat = Ccsr; 2539 Ccsr->num_rows = Ccusp->nrows; 2540 Ccsr->num_cols = n; 2541 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2542 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2543 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2544 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2545 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 2546 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 2547 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2548 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2549 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2550 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2551 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2552 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2553 c->nz = 0; 2554 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2555 Ccsr->values = new THRUSTARRAY(c->nz); 2556 goto finalizesym; 2557 } 2558 2559 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2560 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2561 Acsr = (CsrMatrix*)Amat->mat; 2562 if (!biscompressed) { 2563 Bcsr = (CsrMatrix*)Bmat->mat; 2564 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2565 BmatSpDescr = Bmat->matDescr; 2566 #endif 2567 } else { /* we need to use row offsets for the full matrix */ 2568 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2569 Bcsr = new CsrMatrix; 2570 Bcsr->num_rows = B->rmap->n; 2571 Bcsr->num_cols = cBcsr->num_cols; 2572 Bcsr->num_entries = cBcsr->num_entries; 2573 Bcsr->column_indices = cBcsr->column_indices; 2574 Bcsr->values = cBcsr->values; 2575 if (!Bcusp->rowoffsets_gpu) { 2576 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2577 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2578 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 2579 } 2580 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2581 mmdata->Bcsr = Bcsr; 2582 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2583 if (Bcsr->num_rows && Bcsr->num_cols) { 2584 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2585 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2586 Bcsr->values->data().get(), 2587 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2588 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2589 } 2590 BmatSpDescr = mmdata->matSpBDescr; 2591 #endif 2592 } 2593 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2594 
PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2595 /* precompute flops count */ 2596 if (ptype == MATPRODUCT_AB) { 2597 for (i=0, flops = 0; i<A->rmap->n; i++) { 2598 const PetscInt st = a->i[i]; 2599 const PetscInt en = a->i[i+1]; 2600 for (j=st; j<en; j++) { 2601 const PetscInt brow = a->j[j]; 2602 flops += 2.*(b->i[brow+1] - b->i[brow]); 2603 } 2604 } 2605 } else if (ptype == MATPRODUCT_AtB) { 2606 for (i=0, flops = 0; i<A->rmap->n; i++) { 2607 const PetscInt anzi = a->i[i+1] - a->i[i]; 2608 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2609 flops += (2.*anzi)*bnzi; 2610 } 2611 } else { /* TODO */ 2612 flops = 0.; 2613 } 2614 2615 mmdata->flops = flops; 2616 PetscCall(PetscLogGpuTimeBegin()); 2617 2618 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2619 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2620 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2621 NULL, NULL, NULL, 2622 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2623 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2624 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2625 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2626 { 2627 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2628 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2629 */ 2630 void* dBuffer1 = NULL; 2631 void* dBuffer2 = NULL; 2632 void* dBuffer3 = NULL; 2633 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2634 size_t bufferSize1 = 0; 2635 size_t bufferSize2 = 0; 2636 size_t bufferSize3 = 0; 2637 size_t bufferSize4 = 0; 2638 size_t bufferSize5 = 0; 2639 2640 /*----------------------------------------------------------------------*/ 2641 /* ask bufferSize1 bytes for external memory */ 2642 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2643 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2644 &bufferSize1, NULL);PetscCallCUSPARSE(stat); 2645 PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 2646 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2647 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2648 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2649 &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 2650 2651 /*----------------------------------------------------------------------*/ 2652 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2653 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2654 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 2655 PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 2656 PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 2657 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 2658 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2659 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2660 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 2661 
PetscCallCUDA(cudaFree(dBuffer1)); 2662 PetscCallCUDA(cudaFree(dBuffer2)); 2663 2664 /*----------------------------------------------------------------------*/ 2665 /* get matrix C non-zero entries C_nnz1 */ 2666 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2667 c->nz = (PetscInt) C_nnz1; 2668 /* allocate matrix C */ 2669 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2670 Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2671 /* update matC with the new pointers */ 2672 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2673 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2674 2675 /*----------------------------------------------------------------------*/ 2676 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2677 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2678 &bufferSize5, NULL);PetscCallCUSPARSE(stat); 2679 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 2680 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2681 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2682 &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 2683 PetscCallCUDA(cudaFree(dBuffer3)); 2684 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2685 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2686 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2687 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2688 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 
2689 } 2690 #else 2691 size_t bufSize2; 2692 /* ask bufferSize bytes for external memory */ 2693 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2694 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2695 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2696 mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 2697 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 2698 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2699 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2700 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2701 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2702 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 2703 /* ask bufferSize again bytes for external memory */ 2704 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2705 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2706 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2707 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 2708 /* The CUSPARSE documentation is not clear, nor the API 2709 We need both buffers to perform the operations properly! 2710 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2711 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2712 is stored in the descriptor! What a messy API... 
*/ 2713 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 2714 /* compute the intermediate product of A * B */ 2715 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2716 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2717 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2718 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2719 /* get matrix C non-zero entries C_nnz1 */ 2720 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2721 c->nz = (PetscInt) C_nnz1; 2722 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 2723 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2724 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2725 Ccsr->values = new THRUSTARRAY(c->nz); 2726 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2727 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2728 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2729 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2730 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2731 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2732 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2733 #else 2734 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2735 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2736 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2737 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2738 Bmat->descr, Bcsr->num_entries, 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2739 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 2740 c->nz = cnz; 2741 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2742 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2743 Ccsr->values = new THRUSTARRAY(c->nz); 2744 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2745 2746 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2747 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2748 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2749 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2750 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2751 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2752 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2753 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2754 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2755 #endif 2756 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2757 PetscCall(PetscLogGpuTimeEnd()); 2758 finalizesym: 2759 c->singlemalloc = PETSC_FALSE; 2760 c->free_a = PETSC_TRUE; 2761 c->free_ij = PETSC_TRUE; 2762 PetscCall(PetscMalloc1(m+1,&c->i)); 2763 PetscCall(PetscMalloc1(c->nz,&c->j)); 2764 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2765 PetscInt *d_i = c->i; 2766 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2767 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2768 ii = *Ccsr->row_offsets; 2769 jj = *Ccsr->column_indices; 
2770 if (ciscompressed) d_i = c->compressedrow.i; 2771 PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2772 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2773 } else { 2774 PetscInt *d_i = c->i; 2775 if (ciscompressed) d_i = c->compressedrow.i; 2776 PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2777 PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2778 } 2779 if (ciscompressed) { /* need to expand host row offsets */ 2780 PetscInt r = 0; 2781 c->i[0] = 0; 2782 for (k = 0; k < c->compressedrow.nrows; k++) { 2783 const PetscInt next = c->compressedrow.rindex[k]; 2784 const PetscInt old = c->compressedrow.i[k]; 2785 for (; r < next; r++) c->i[r+1] = old; 2786 } 2787 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2788 } 2789 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 2790 PetscCall(PetscMalloc1(m,&c->ilen)); 2791 PetscCall(PetscMalloc1(m,&c->imax)); 2792 c->maxnz = c->nz; 2793 c->nonzerorowcnt = 0; 2794 c->rmax = 0; 2795 for (k = 0; k < m; k++) { 2796 const PetscInt nn = c->i[k+1] - c->i[k]; 2797 c->ilen[k] = c->imax[k] = nn; 2798 c->nonzerorowcnt += (PetscInt)!!nn; 2799 c->rmax = PetscMax(c->rmax,nn); 2800 } 2801 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 2802 PetscCall(PetscMalloc1(c->nz,&c->a)); 2803 Ccsr->num_entries = c->nz; 2804 2805 C->nonzerostate++; 2806 PetscCall(PetscLayoutSetUp(C->rmap)); 2807 PetscCall(PetscLayoutSetUp(C->cmap)); 2808 Ccusp->nonzerostate = C->nonzerostate; 2809 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2810 C->preallocated = PETSC_TRUE; 2811 C->assembled = PETSC_FALSE; 2812 C->was_assembled = PETSC_FALSE; 2813 if (product->api_user && A->offloadmask 
== PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
/*
   Chooses the productsymbolic implementation for mat = op(A,B[,C]) based on the
   operand types and command-line options:
     - dense B           -> SeqAIJCUSPARSE x SeqDENSECUDA kernels (unless A is bound to CPU)
     - all-cusparse      -> SeqAIJCUSPARSE x SeqAIJCUSPARSE kernels (user may force the CPU
                            backend with -mat(xxx)mult_backend_cpu / -mat_product_algorithm_backend_cpu)
     - anything else     -> fall back to the SeqAIJ (CPU) product machinery
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool   isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* operands bound to CPU disqualify the GPU backend */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name differs depending on whether the user entered through the
       MatXXXMult() API (api_user) or through the MatProduct interface */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user requested CPU backend: pretend operands are not cusparse */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A xx; thin wrapper over the shared SpMV kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A^H xx */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for 0 <= i < n; one thread per entry, launched on a 1-D grid.
   Used to scatter-add the compressed-row SpMV result back into the full-length vector. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i]; /* bounds guard: grid may overshoot n */
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared implementation behind all the MatMult/MatMultAdd/(Hermitian)Transpose wrappers.
   yy may be NULL (no add) and may alias zz (in-place add). When the matrix stores
   compressed rows (zero rows dropped), a work vector holds the short product and is
   scattered (forward op) or gathered (transpose op) via cprowIndices. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y as seen by op(A); only set/used on the CSR path */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: result is just beta*y (or zero) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose, applied non-transposed */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* the cuSpMV descriptor cache below is indexed by opA (0..2) */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                                  matstruct->matDescr,
                                                  matstruct->cuSpMV[opA].vecXDescr, beta,
                                                  matstruct->cuSpMV[opA].vecYDescr,
                                                  cusparse_scalartype,
                                                  cusparsestruct->spmvAlg,
                                                  &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                     matstruct->alpha_one,
                                     matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr,
                                     beta,
                                     matstruct->cuSpMV[opA].vecYDescr,
                                     cusparse_scalartype,
                                     cusparsestruct->spmvAlg,
                                     matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11 legacy csrmv API */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                          mat->num_rows, mat->num_cols,
                                          mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                          mat->values->data().get(), mat->row_offsets->data().get(),
                                          mat->column_indices->data().get(), xptr, beta,
                                          dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                            matstruct->alpha_one, matstruct->descr, hybMat,
                                            xptr, beta,
                                            dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        /* ceil-div launch; kernel itself guards i < n */
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Runs the CPU AIJ assembly, then drops the cached device matrix if the nonzero
   pattern changed during assembly (it would no longer match). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscObjectState   onnz = A->nonzerostate; /* state before assembly, to detect pattern changes */
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (onnz != A->nonzerostate &&
cusp->deviceMat) {

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}

/* Frees the GPU-side data (plain or factored), clears the composed query
   functions registered by this type, then runs the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the CPU SeqAIJ path, then convert the copy back to SEQAIJCUSPARSE in place */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU.
   Fast paths: identical nonzero patterns -> a single cublas axpy on the value arrays;
   subset pattern -> cusparse spgeam; anything else falls back to the CPU MatAXPY_SeqAIJ. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed CPU/GPU binding: use the CPU implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* Y keeps unit weight: result = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t bufferSize;
    void *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* alpha/beta are host scalars here, so switch pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                     &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                     &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                     cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a flat axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the CPU implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, applied directly to the GPU value array via cublas scal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t
cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz,&bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zeros the matrix values on both the device (plain matrix and cached transpose, if present)
   and the host; sets the offload mask to BOTH when the device copy was zeroed too. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool  both = PETSC_FALSE; /* true iff the device values were zeroed as well */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) { /* keep the cached explicit transpose consistent */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n])); /* zero the host copy too */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switches the operation table between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE)
   implementations. flg == PETSC_TRUE binds the matrix to the CPU: data is first
   copied back from the GPU and GPU-specific composed functions are removed. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) { /* factored matrices only record the flag */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); /* reset all SeqAIJ sub-ops to the CPU defaults */
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  if (flg && a->inode.size) { /* inode optimization only applies on the CPU */
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Converts a SeqAIJ matrix to SEQAIJCUSPARSE: allocates the GPU-side containers
   (plain or triangular-factor, depending on factortype), installs the GPU ops via
   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and registers the composed functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  /* MAT_INPLACE_MATRIX: A itself is converted */
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
 #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
 #endif
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: GPU data lives in a triangular-factors container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* build a plain SeqAIJ matrix first, then convert it in place to the CUSPARSE type */
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

   Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE-based factorizations (band LU for SeqAIJ, and LU/Cholesky/ILU/ICC for SeqAIJCUSPARSE) */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(0);
}

/* Free the COO-assembly data attached to the device structure (cooPerm arrays and,
   in the extended-COO case, the device jmap/perm maps) */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d again
       whenever they are non-NULL, so leaving them dangling here would double free */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Destroy the Mat_SeqAIJCUSPARSE device structure and everything it owns */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 3620 delete (*cusparsestruct)->workVector; 3621 delete (*cusparsestruct)->rowoffsets_gpu; 3622 delete (*cusparsestruct)->cooPerm; 3623 delete (*cusparsestruct)->cooPerm_a; 3624 delete (*cusparsestruct)->csr2csc_i; 3625 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 3626 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 3627 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 3628 PetscCall(PetscFree(*cusparsestruct)); 3629 } 3630 PetscFunctionReturn(0); 3631 } 3632 3633 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3634 { 3635 PetscFunctionBegin; 3636 if (*mat) { 3637 delete (*mat)->values; 3638 delete (*mat)->column_indices; 3639 delete (*mat)->row_offsets; 3640 delete *mat; 3641 *mat = 0; 3642 } 3643 PetscFunctionReturn(0); 3644 } 3645 3646 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3647 { 3648 PetscFunctionBegin; 3649 if (*trifactor) { 3650 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3651 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo)); 3652 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 3653 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 3654 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3655 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3656 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3657 #endif 3658 PetscCall(PetscFree(*trifactor)); 3659 } 3660 PetscFunctionReturn(0); 3661 } 3662 3663 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3664 { 3665 CsrMatrix *mat; 3666 3667 
PetscFunctionBegin; 3668 if (*matstruct) { 3669 if ((*matstruct)->mat) { 3670 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3671 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3672 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3673 #else 3674 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3675 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3676 #endif 3677 } else { 3678 mat = (CsrMatrix*)(*matstruct)->mat; 3679 CsrMatrix_Destroy(&mat); 3680 } 3681 } 3682 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 3683 delete (*matstruct)->cprowIndices; 3684 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 3685 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 3686 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3687 3688 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3689 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3690 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3691 for (int i=0; i<3; i++) { 3692 if (mdata->cuSpMV[i].initialized) { 3693 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 3694 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 3695 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3696 } 3697 } 3698 #endif 3699 delete *matstruct; 3700 *matstruct = NULL; 3701 } 3702 PetscFunctionReturn(0); 3703 } 3704 3705 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3706 { 3707 PetscFunctionBegin; 3708 if (*trifactors) { 3709 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr)); 3710 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr)); 3711 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose)); 3712 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose)); 3713 delete (*trifactors)->rpermIndices; 3714 delete (*trifactors)->cpermIndices; 3715 delete (*trifactors)->workVector; 3716 (*trifactors)->rpermIndices = NULL; 3717 (*trifactors)->cpermIndices = NULL; 3718 (*trifactors)->workVector = NULL; 3719 if ((*trifactors)->a_band_d) PetscCallCUDA(cudaFree((*trifactors)->a_band_d)); 3720 if ((*trifactors)->i_band_d) PetscCallCUDA(cudaFree((*trifactors)->i_band_d)); 3721 (*trifactors)->init_dev_prop = PETSC_FALSE; 3722 } 3723 PetscFunctionReturn(0); 3724 } 3725 3726 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3727 { 3728 cusparseHandle_t handle; 3729 3730 PetscFunctionBegin; 3731 if (*trifactors) { 3732 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 3733 if (handle = (*trifactors)->handle) { 3734 PetscCallCUSPARSE(cusparseDestroy(handle)); 3735 } 3736 PetscCall(PetscFree(*trifactors)); 3737 } 3738 PetscFunctionReturn(0); 3739 } 3740 3741 struct IJCompare 3742 { 3743 __host__ __device__ 3744 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3745 { 3746 if (t1.get<0>() < t2.get<0>()) return true; 3747 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3748 return false; 3749 } 3750 }; 3751 3752 struct IJEqual 3753 { 3754 __host__ __device__ 3755 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3756 { 3757 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3758 return true; 3759 } 3760 }; 3761 3762 struct IJDiff 3763 { 3764 __host__ __device__ 3765 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3766 { 3767 return t1 == t2 ? 
0 : 1;
  }
};

/* Integer logical OR, used to merge the row-changed and column-changed flags */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(); scatters/accumulates the
   user-provided COO values v[] into the device CSR values using the cooPerm permutation
   (and cooPerm_a to combine duplicate entries when present) */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* temporary device copy of v[] when v[] lives on the host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation was done: just finish assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) { /* NULL v[]: with INSERT_VALUES this zeros the matrix, with ADD_VALUES it is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD_VALUES means add to the existing matrix values */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them together first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Drop (or destroy, when destroy is true) the cached transpose so it is rebuilt on next use */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts the nonzero rows */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed: drop the old permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copies of the COO row/col indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR metadata (a->i, a->j, a->ilen, a->imax) from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count rows with at least one nonzero */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point: uses the 'Basic' device path when the indices are on the
   device or contain no negative entries, and the extended (CPU-preprocessed) path otherwise */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices (meaning: ignore this entry) are only handled by the extended path */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    /* build jmap/perm on the CPU, then mirror them to the device for MatSetValuesCOO */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ*>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}

/* Kernel: for each nonzero i, accumulate sum of kv[perm[k]] for k in [jmap[i], jmap[i+1])
   into a[i] (overwriting with INSERT_VALUES, adding with ADD_VALUES). Uses a grid-stride
   loop, so any launch configuration covers all nnz entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}

/* Set/add the COO values v[] into the device CSR values, using the extended (jmap/perm)
   path when it was prepared by MatSetPreallocationCOO_SeqAIJCUSPARSE, otherwise the Basic path */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount         Annz = seq->nz;
  PetscMemType       memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] on the host, copy it to a device buffer first */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

  Not collective

  Input Parameters:
+ A - the matrix
- compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ ia - the CSR row pointers
- ja - the CSR column indices

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily mirror the uncompressed host row offsets a->i to the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

  Not collective

  Input Parameters:
+ A - the matrix
- compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ ia - the CSR row pointers
- ja - the CSR column indices

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* just invalidate the caller's pointers; nothing is copied back */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

  Not Collective

  Input Parameter:
. A - a MATSEQAIJCUSPARSE matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure the device copy is current */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

  Not Collective

  Input Parameter:
. A - a MATSEQAIJCUSPARSE matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state increase is needed, just drop the pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

  Not Collective

  Input Parameter:
. A - a MATSEQAIJCUSPARSE matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: bring the data up to date first */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may modify the values: GPU copy becomes authoritative and the cached transpose stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

  Not Collective

  Input Parameter:
. A - a MATSEQAIJCUSPARSE matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have changed: invalidate the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

  Not Collective

  Input Parameter:
. A - a MATSEQAIJCUSPARSE matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only: no MatSeqAIJCUSPARSECopyToGPU(), the current values will be overwritten */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access
   array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (presumably) overwritten on the device: drop cached diagonal info */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* bump the object state so dependent objects notice the modification */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* invalidate the caller's pointer; access must go through Get/Restore pairs */
  PetscFunctionReturn(0);
}

/* Comparison functor for thrust::merge on zipped (row, col, value, perm-flag) COO tuples:
   orders entries lexicographically by (row, col); value and flag do not participate */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Transform functor adding a fixed offset to an index; used to shift B's column indices
   (by A->cmap->n) and B's transposed row offsets (by a->nz) when concatenating matrices */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */ 4360 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4361 { 4362 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4363 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4364 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4365 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4366 PetscInt Annz,Bnnz; 4367 cusparseStatus_t stat; 4368 PetscInt i,m,n,zero = 0; 4369 4370 PetscFunctionBegin; 4371 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4372 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4373 PetscValidPointer(C,4); 4374 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4375 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4376 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 4377 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4378 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4379 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4380 if (reuse == MAT_INITIAL_MATRIX) { 4381 m = A->rmap->n; 4382 n = A->cmap->n + B->cmap->n; 4383 PetscCall(MatCreate(PETSC_COMM_SELF,C)); 4384 PetscCall(MatSetSizes(*C,m,n,m,n)); 4385 PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 4386 c = (Mat_SeqAIJ*)(*C)->data; 4387 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4388 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4389 Ccsr = new CsrMatrix; 4390 Cmat->cprowIndices = NULL; 4391 c->compressedrow.use = PETSC_FALSE; 4392 c->compressedrow.nrows = 0; 4393 c->compressedrow.i = NULL; 4394 c->compressedrow.rindex = NULL; 4395 Ccusp->workVector = NULL; 4396 Ccusp->nrows = m; 4397 Ccusp->mat = Cmat; 4398 Ccusp->mat->mat = Ccsr; 4399 Ccsr->num_rows = m; 4400 Ccsr->num_cols = n; 4401 
PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4402 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4403 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4404 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 4405 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 4406 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4407 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4408 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4409 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4410 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4411 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4412 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4413 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4414 4415 Acsr = (CsrMatrix*)Acusp->mat->mat; 4416 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4417 Annz = (PetscInt)Acsr->column_indices->size(); 4418 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4419 c->nz = Annz + Bnnz; 4420 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4421 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4422 Ccsr->values = new THRUSTARRAY(c->nz); 4423 Ccsr->num_entries = c->nz; 4424 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4425 if (c->nz) { 4426 auto Acoo = new THRUSTINTARRAY32(Annz); 4427 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4428 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4429 THRUSTINTARRAY32 *Aroff,*Broff; 4430 4431 if (a->compressedrow.use) { /* need full row offset */ 4432 if (!Acusp->rowoffsets_gpu) { 4433 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4434 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4435 
PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4436 } 4437 Aroff = Acusp->rowoffsets_gpu; 4438 } else Aroff = Acsr->row_offsets; 4439 if (b->compressedrow.use) { /* need full row offset */ 4440 if (!Bcusp->rowoffsets_gpu) { 4441 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4442 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4443 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 4444 } 4445 Broff = Bcusp->rowoffsets_gpu; 4446 } else Broff = Bcsr->row_offsets; 4447 PetscCall(PetscLogGpuTimeBegin()); 4448 stat = cusparseXcsr2coo(Acusp->handle, 4449 Aroff->data().get(), 4450 Annz, 4451 m, 4452 Acoo->data().get(), 4453 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4454 stat = cusparseXcsr2coo(Bcusp->handle, 4455 Broff->data().get(), 4456 Bnnz, 4457 m, 4458 Bcoo->data().get(), 4459 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4460 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4461 auto Aperm = thrust::make_constant_iterator(1); 4462 auto Bperm = thrust::make_constant_iterator(0); 4463 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4464 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4465 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4466 #else 4467 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4468 auto Bcib = Bcsr->column_indices->begin(); 4469 auto Bcie = Bcsr->column_indices->end(); 4470 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4471 #endif 4472 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4473 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4474 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4475 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4476 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4477 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4478 auto p1 = Ccusp->cooPerm->begin(); 4479 auto p2 = Ccusp->cooPerm->begin(); 4480 thrust::advance(p2,Annz); 4481 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4482 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4483 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4484 #endif 4485 auto cci = thrust::make_counting_iterator(zero); 4486 auto cce = thrust::make_counting_iterator(c->nz); 4487 #if 0 //Errors on SUMMIT cuda 11.1.0 4488 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4489 #else 4490 auto pred = thrust::identity<int>(); 4491 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4492 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4493 #endif 4494 stat = cusparseXcoo2csr(Ccusp->handle, 4495 Ccoo->data().get(), 4496 c->nz, 4497 m, 4498 Ccsr->row_offsets->data().get(), 4499 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4500 PetscCall(PetscLogGpuTimeEnd()); 4501 delete wPerm; 4502 delete Acoo; 4503 delete Bcoo; 4504 delete Ccoo; 4505 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4506 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4507 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4508 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4509 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4510 #endif 4511 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4512 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 4513 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4514 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4515 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4516 CsrMatrix *CcsrT = new CsrMatrix; 4517 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4518 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4519 4520 (*C)->form_explicit_transpose = PETSC_TRUE; 4521 (*C)->transupdated = PETSC_TRUE; 4522 Ccusp->rowoffsets_gpu = NULL; 4523 CmatT->cprowIndices = NULL; 4524 CmatT->mat = CcsrT; 4525 CcsrT->num_rows = n; 4526 CcsrT->num_cols = m; 4527 CcsrT->num_entries = c->nz; 4528 4529 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4530 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4531 CcsrT->values = new THRUSTARRAY(c->nz); 4532 4533 PetscCall(PetscLogGpuTimeBegin()); 4534 auto rT = CcsrT->row_offsets->begin(); 4535 if (AT) { 4536 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4537 thrust::advance(rT,-1); 4538 } 4539 if (BT) { 4540 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4541 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4542 thrust::copy(titb,tite,rT); 4543 } 4544 auto cT = CcsrT->column_indices->begin(); 4545 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4546 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4547 auto vT = CcsrT->values->begin(); 4548 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4549 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4550 PetscCall(PetscLogGpuTimeEnd()); 4551 4552 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 4553 PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, 
CUSPARSE_INDEX_BASE_ZERO)); 4554 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4555 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 4556 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 4557 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 4558 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4559 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4560 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4561 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4562 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4563 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4564 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4565 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4566 #endif 4567 Ccusp->matTranspose = CmatT; 4568 } 4569 } 4570 4571 c->singlemalloc = PETSC_FALSE; 4572 c->free_a = PETSC_TRUE; 4573 c->free_ij = PETSC_TRUE; 4574 PetscCall(PetscMalloc1(m+1,&c->i)); 4575 PetscCall(PetscMalloc1(c->nz,&c->j)); 4576 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4577 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4578 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4579 ii = *Ccsr->row_offsets; 4580 jj = *Ccsr->column_indices; 4581 PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4582 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4583 } else { 4584 PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4585 
PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4586 } 4587 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 4588 PetscCall(PetscMalloc1(m,&c->ilen)); 4589 PetscCall(PetscMalloc1(m,&c->imax)); 4590 c->maxnz = c->nz; 4591 c->nonzerorowcnt = 0; 4592 c->rmax = 0; 4593 for (i = 0; i < m; i++) { 4594 const PetscInt nn = c->i[i+1] - c->i[i]; 4595 c->ilen[i] = c->imax[i] = nn; 4596 c->nonzerorowcnt += (PetscInt)!!nn; 4597 c->rmax = PetscMax(c->rmax,nn); 4598 } 4599 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 4600 PetscCall(PetscMalloc1(c->nz,&c->a)); 4601 (*C)->nonzerostate++; 4602 PetscCall(PetscLayoutSetUp((*C)->rmap)); 4603 PetscCall(PetscLayoutSetUp((*C)->cmap)); 4604 Ccusp->nonzerostate = (*C)->nonzerostate; 4605 (*C)->preallocated = PETSC_TRUE; 4606 } else { 4607 PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4608 c = (Mat_SeqAIJ*)(*C)->data; 4609 if (c->nz) { 4610 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4611 PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4612 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4613 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4614 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4615 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4616 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4617 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4618 Acsr = (CsrMatrix*)Acusp->mat->mat; 4619 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4620 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4621 PetscCheck(Acsr->num_entries == 
(PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 4622 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4623 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4624 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4625 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4626 auto pmid = Ccusp->cooPerm->begin(); 4627 thrust::advance(pmid,Acsr->num_entries); 4628 PetscCall(PetscLogGpuTimeBegin()); 4629 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4630 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4631 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4632 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4633 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4634 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4635 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4636 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4637 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4638 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4639 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 4640 if (A->form_explicit_transpose && 
B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4641 PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4642 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4643 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4644 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4645 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4646 auto vT = CcsrT->values->begin(); 4647 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4648 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4649 (*C)->transupdated = PETSC_TRUE; 4650 } 4651 PetscCall(PetscLogGpuTimeEnd()); 4652 } 4653 } 4654 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4655 (*C)->assembled = PETSC_TRUE; 4656 (*C)->was_assembled = PETSC_FALSE; 4657 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4658 PetscFunctionReturn(0); 4659 } 4660 4661 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4662 { 4663 bool dmem; 4664 const PetscScalar *av; 4665 4666 PetscFunctionBegin; 4667 dmem = isCudaMem(v); 4668 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 4669 if (n && idx) { 4670 THRUSTINTARRAY widx(n); 4671 widx.assign(idx,idx+n); 4672 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 4673 4674 THRUSTARRAY *w = NULL; 4675 thrust::device_ptr<PetscScalar> dv; 4676 if (dmem) { 4677 dv = thrust::device_pointer_cast(v); 4678 } else { 4679 w = new THRUSTARRAY(n); 4680 dv = w->data(); 4681 } 4682 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4683 4684 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4685 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4686 
thrust::for_each(zibit,zieit,VecCUDAEquals()); 4687 if (w) { 4688 PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4689 } 4690 delete w; 4691 } else { 4692 PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4693 } 4694 if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4695 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4696 PetscFunctionReturn(0); 4697 } 4698