1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/adjacent_difference.h> 16 #include <thrust/async/for_each.h> 17 #include <thrust/iterator/constant_iterator.h> 18 #include <thrust/remove.h> 19 #include <thrust/sort.h> 20 #include <thrust/unique.h> 21 22 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
26 27 typedef enum { 28 CUSPARSE_MV_ALG_DEFAULT = 0, 29 CUSPARSE_COOMV_ALG = 1, 30 CUSPARSE_CSRMV_ALG1 = 2, 31 CUSPARSE_CSRMV_ALG2 = 3 32 } cusparseSpMVAlg_t; 33 34 typedef enum { 35 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40 CUSPARSE_SPMM_ALG_DEFAULT = 0, 41 CUSPARSE_SPMM_COO_ALG1 = 1, 42 CUSPARSE_SPMM_COO_ALG2 = 2, 43 CUSPARSE_SPMM_COO_ALG3 = 3, 44 CUSPARSE_SPMM_COO_ALG4 = 5, 45 CUSPARSE_SPMM_CSR_ALG1 = 4, 46 CUSPARSE_SPMM_CSR_ALG2 = 6, 47 } cusparseSpMMAlg_t; 48 49 typedef enum { 50 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52 } cusparseCsr2CscAlg_t; 53 */ 54 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 56 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! 
We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57 #endif 58 59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62 63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66 67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 81 82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84 static PetscErrorCode 
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Query callback composed on factor matrices below: reports MATSOLVERCUSPARSE as the package that produced the factors */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Creates an (unallocated) MATSEQAIJCUSPARSE factor matrix B for A and wires the symbolic-factorization
   function pointers for the requested factor type. When A is bound to the CPU the plain SeqAIJ symbolic
   routines are installed instead of the CUSPARSE ones. Also records the preferred ordering per factor type
   (nested dissection for LU/Cholesky, natural for the incomplete variants). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* propagate CPU binding from A so the factor follows the same placement policy */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE: records the requested storage format.
   Note MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single format field of Mat_SeqAIJCUSPARSE here;
   any other operation value is rejected */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch through the composed method so non-CUSPARSE Mat types silently ignore the request */
  PetscCall(PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)));
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetUseCPUSolve() for SEQAIJCUSPARSE: stores the flag consulted by
   MatLUFactorNumeric_SeqAIJCUSPARSE when choosing CPU vs GPU triangular solves */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  PetscCall(PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)));
  PetscFunctionReturn(0);
}

/* Handles MAT_FORM_EXPLICIT_TRANSPOSE specially (the cached GPU transpose must be invalidated when the
   option is turned off); everything else is delegated to the SeqAIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU: runs the built-in CPU factorization (MatLUFactorNumeric_SeqAIJ) after ensuring A's values
   are current on the host, then installs the solve routines (natural-ordering fast path when both
   permutations are the identity) and, unless use_cpu_solve is set, copies the triangular factors to the GPU */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS isrow = b->row,iscol = b->col;
  PetscBool row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factors just written on the host */
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));
  if (row_identity && col_identity) {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}

/* Parses the -mat_cusparse_* command-line options (storage format, CPU solve, and, for CUDA >= 11,
   the SpMV/SpMM/csr2csc algorithm choices). The PetscCheckFalse() guards verify that the cuSPARSE enum
   values still match the 0-based positions assumed by the MatCUSPARSE*Algorithms[] name tables */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool flg;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options"));
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);PetscCall(ierr);
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);PetscCall(ierr);
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);PetscCall(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);PetscCall(ierr);
    PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);PetscCall(ierr);
    PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscCall(PetscOptionsTail());
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any stale GPU triangular-factor data, run the CPU symbolic phase, and redirect
   the numeric phase to the CUSPARSE version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as the ILU wrapper above */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, numeric phase redirected to the CUSPARSE Cholesky numeric routine */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as the ICC wrapper above */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric
= MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Builds the unit-diagonal lower-triangular factor L in CSR form from the host (I)LU data of A and
   uploads it to the GPU. On first call this allocates the cuSPARSE descriptor, CSR arrays, and the
   triangular-solve analysis info; on subsequent calls (loTriFactor already present) only the numerical
   values are refilled and re-assigned to the device array. The host value buffer AA_h is kept (pinned)
   for those value-only updates. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt *ai = a->i,*aj = a->j,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiLo, *AjLo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (strictly-lower entries of rows 1..n-1 plus one unit diagonal per row; row 0 of L is just the diagonal 1) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host memory so the later thrust assign()s copy at full bandwidth */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt) 0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt) 0;
        AALo[0] = (MatScalar) 1.0;
        v = aa;
        vi = aj;
        offset = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz+1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* append the unit diagonal entry after the strictly-lower part of row i */
          offset += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                            loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                            loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                            loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            loTriFactor->solveInfo,
                                            loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                            loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo; /* keep the pinned value buffer for value-only updates below */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v = aa;
        vi = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the upper-triangular factor U in CSR form from the host (I)LU data of A (rows filled back to
   front using a->diag) and uploads it to the GPU; the stored diagonal entries are the reciprocals
   1/v[nz] since the descriptor is marked CUSPARSE_DIAG_TYPE_NON_UNIT. Same first-call/update-only
   split as the lower-triangular builder above. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscInt i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz]; /* reciprocal: descriptor is DIAG_TYPE_NON_UNIT */
          AiUp[i] = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                                  &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                            upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                            upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                            upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            upTriFactor->solveInfo,
                                            upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                            upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp; /* keep the pinned value buffer for value-only updates below */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Uploads both ILU triangular factors to the GPU (building them on first use), allocates the device
   work vector, and caches the row/column permutation index arrays on the device when the orderings
   are not the identity */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS isrow = a->row,iscol = a->icol;
  PetscBool row_identity,col_identity;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* device scratch vector reused by the triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  /* cache the row permutation on the device only when the ordering is not the identity */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices */
  /* likewise for the column permutation */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Builds both ICC/Cholesky triangular factors for the GPU from the host factorization.
   NOTE(review): A->data is cast both as Mat_SeqAIJ (for a->nz) and Mat_SeqSBAIJ (for b->i/b->j/b->a);
   this relies on the Cholesky factor being stored in SBAIJ layout — confirm against the CPU factorization */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscScalar *AALo;
  PetscInt nzUpper = a->nz,n
= A->rmap->n,i,offset,nz,j; 714 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 715 const PetscInt *ai = b->i,*aj = b->j,*vj; 716 const MatScalar *aa = b->a,*v; 717 718 PetscFunctionBegin; 719 if (!n) PetscFunctionReturn(0); 720 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 721 try { 722 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 723 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 724 if (!upTriFactor && !loTriFactor) { 725 /* Allocate Space for the upper triangular matrix */ 726 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 727 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 728 729 /* Fill the upper triangular matrix */ 730 AiUp[0]=(PetscInt) 0; 731 AiUp[n]=nzUpper; 732 offset = 0; 733 for (i=0; i<n; i++) { 734 /* set the pointers */ 735 v = aa + ai[i]; 736 vj = aj + ai[i]; 737 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 738 739 /* first, set the diagonal elements */ 740 AjUp[offset] = (PetscInt) i; 741 AAUp[offset] = (MatScalar)1.0/v[nz]; 742 AiUp[i] = offset; 743 AALo[offset] = (MatScalar)1.0/v[nz]; 744 745 offset+=1; 746 if (nz>0) { 747 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 748 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 749 for (j=offset; j<offset+nz; j++) { 750 AAUp[j] = -AAUp[j]; 751 AALo[j] = AAUp[j]/v[nz]; 752 } 753 offset+=nz; 754 } 755 } 756 757 /* allocate space for the triangular factor information */ 758 PetscCall(PetscNew(&upTriFactor)); 759 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 760 761 /* Create the matrix description */ 762 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 763 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 764 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 765 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 766 #else 767 
PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                                  &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                            upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                            upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                            upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            upTriFactor->solveInfo,
                                            upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                            upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* note: the "lower" factor is stored as upper-triangular data and solved with
           the TRANSPOSE operation below, so the fill mode stays UPPER here */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                                  &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                            loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                            loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                            loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                            loTriFactor->solveInfo,
                                            loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                            loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* structures already exist: only refresh the numerical values.
           Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds the ICC triangular factors on the GPU and uploads the row/column
   permutations (as their index arrays) when the ordering is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both factors store the off-diagonal entries, the diagonal only once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: performs the factorization on the CPU
   (via MatCholeskyFactorNumeric_SeqAIJ) and then copies the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b  = (Mat_SeqAIJ*)B->data;
  IS         ip  = b->row;
  PetscBool  perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Builds the transposes (CSC forms) of both triangular factors and runs the
   cuSPARSE solve analysis on them, for use by MatSolveTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips since the transpose of an upper factor is lower (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                                  loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(),
                                                  loTriFactor->csrMat->column_indices->data().get(),
                                                  loTriFactorT->csrMat->values->data().get(),
                                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                                  CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                     loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                     loTriFactor->csrMat->values->data().get(),
                                     loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(),
                                     loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                     loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                     CUSPARSE_ACTION_NUMERIC, indexBase,
                                     CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                     loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                     CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  /* bug fix: this must be PetscLogEventEnd() to balance the PetscLogEventBegin() above;
     the original called PetscLogEventBegin() twice, corrupting event nesting */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                            &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                      loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                      loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                      loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                      loTriFactorT->solveInfo,
                                      loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                      loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                                  upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(),
                                                  upTriFactor->csrMat->column_indices->data().get(),
                                                  upTriFactorT->csrMat->values->data().get(),
                                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                                  CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                     upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                     upTriFactor->csrMat->values->data().get(),
                                     upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(),
                                     upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                     CUSPARSE_ACTION_NUMERIC, indexBase,
                                     CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                     CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  /* bug fix: same as above — End, not a second Begin */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1162 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1163 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1164 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1165 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1166 &upTriFactorT->solveBufferSize)); 1167 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1168 #endif 1169 1170 /* perform the solve analysis */ 1171 /* christ, would it have killed you to put this stuff in a function????????? */ 1172 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1173 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1174 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1175 upTriFactorT->csrMat->column_indices->data().get(), 1176 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1177 upTriFactorT->solveInfo, 1178 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1179 #else 1180 upTriFactorT->solveInfo)); 1181 #endif 1182 1183 PetscCallCUDA(WaitForCUDA()); 1184 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1185 1186 /* assign the pointer */ 1187 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1188 PetscFunctionReturn(0); 1189 } 1190 1191 struct PetscScalarToPetscInt 1192 { 1193 __host__ __device__ 1194 PetscInt operator()(PetscScalar s) 1195 { 1196 return (PetscInt)PetscRealPart(s); 1197 } 1198 }; 1199 1200 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1201 { 1202 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1203 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1204 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1205 cusparseStatus_t stat; 1206 cusparseIndexBase_t indexBase; 1207 1208 PetscFunctionBegin; 
1209 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1210 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1211 PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1212 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1213 PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1214 if (A->transupdated) PetscFunctionReturn(0); 1215 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1216 PetscCall(PetscLogGpuTimeBegin()); 1217 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1218 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1219 } 1220 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1221 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1222 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1223 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1224 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1225 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1226 1227 /* set alpha and beta */ 1228 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 1229 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 1230 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1231 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1232 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1233 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1234 1235 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1236 CsrMatrix *matrixT = new CsrMatrix; 1237 matstructT->mat = matrixT; 1238 matrixT->num_rows = A->cmap->n; 1239 matrixT->num_cols = A->rmap->n; 1240 matrixT->num_entries = 
a->nz; 1241 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1242 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1243 matrixT->values = new THRUSTARRAY(a->nz); 1244 1245 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1246 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1247 1248 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1249 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1250 stat = cusparseCreateCsr(&matstructT->matDescr, 1251 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1252 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1253 matrixT->values->data().get(), 1254 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1255 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1256 #else 1257 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1258 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1259 1260 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1261 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1262 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1263 */ 1264 if (matrixT->num_entries) { 1265 stat = cusparseCreateCsr(&matstructT->matDescr, 1266 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1267 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1268 matrixT->values->data().get(), 1269 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1270 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1271 1272 } else { 1273 matstructT->matDescr = NULL; 1274 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1275 } 1276 #endif 1277 #endif 1278 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1279 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1280 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1281 #else 1282 CsrMatrix *temp = new CsrMatrix; 1283 CsrMatrix *tempT = new CsrMatrix; 1284 /* First convert HYB to CSR */ 1285 temp->num_rows = A->rmap->n; 1286 temp->num_cols = A->cmap->n; 1287 temp->num_entries = a->nz; 1288 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1289 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1290 temp->values = new THRUSTARRAY(a->nz); 1291 1292 stat = cusparse_hyb2csr(cusparsestruct->handle, 1293 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1294 temp->values->data().get(), 1295 temp->row_offsets->data().get(), 1296 temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1297 1298 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1299 tempT->num_rows = A->rmap->n; 1300 tempT->num_cols = A->cmap->n; 1301 tempT->num_entries = a->nz; 1302 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1303 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1304 tempT->values = new THRUSTARRAY(a->nz); 1305 1306 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1307 temp->num_cols, temp->num_entries, 1308 temp->values->data().get(), 1309 temp->row_offsets->data().get(), 1310 temp->column_indices->data().get(), 1311 tempT->values->data().get(), 1312 tempT->column_indices->data().get(), 1313 tempT->row_offsets->data().get(), 1314 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1315 1316 /* Last, convert CSC to HYB */ 1317 cusparseHybMat_t hybMat; 1318 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1319 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1320 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1321 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1322 matstructT->descr, tempT->values->data().get(), 1323 tempT->row_offsets->data().get(), 1324 tempT->column_indices->data().get(), 1325 hybMat, 0, partition);PetscCallCUSPARSE(stat); 1326 1327 /* assign the pointer */ 1328 matstructT->mat = hybMat; 1329 A->transupdated = PETSC_TRUE; 1330 /* delete temporaries */ 1331 if (tempT) { 1332 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1333 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1334 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1335 delete (CsrMatrix*) tempT; 1336 } 1337 if (temp) { 1338 if (temp->values) delete (THRUSTARRAY*) temp->values; 1339 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1340 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1341 delete (CsrMatrix*) temp; 1342 } 1343 #endif 1344 } 1345 } 1346 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    /* Both the forward CSR and the (previously built) transposed CSR must be fully populated on the GPU */
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC entry permutation once: run csr2csc on the sequence 0,1,2,... so the
         "values" of the transpose come out holding the source position of each entry; that index
         map (csr2csc_i) is cached and reused to update the transpose values cheaply afterwards. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA-11 csr2cscEx2 requires an explicit user-provided work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
#endif
      } else {
        /* nnz == 0: per the cuSPARSE docs the transposed row offsets are all indexBase */
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* Convert the scalar-valued permutation produced above into an integer index array */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* Gather the current values of A into the transpose via the cached csr2csc_i index map */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the (lazily created) transposed triangular factors, applying the
   row permutation to b first and the column permutation to x last. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, placing the result in x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U (for the transposed system the U-factor solve comes before L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A^T x = b when the factorization used natural ordering: no row/column
   permutations are needed, only the two transposed triangular solves. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b using the cached triangular factors (L then U), with row permutation
   applied to b up front and column permutation applied to the result at the end. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b when the factorization used natural ordering: no permutations,
   just the forward (L) and backward (U) triangular solves through a work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
#else
                        xarray);PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
PetscFunctionReturn(0); 1719 } 1720 1721 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1722 { 1723 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1724 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1725 1726 PetscFunctionBegin; 1727 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1728 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1729 1730 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 1731 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 1732 PetscCallCUDA(WaitForCUDA()); 1733 PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 1734 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 1735 A->offloadmask = PETSC_OFFLOAD_BOTH; 1736 } 1737 PetscFunctionReturn(0); 1738 } 1739 1740 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1741 { 1742 PetscFunctionBegin; 1743 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1744 *array = ((Mat_SeqAIJ*)A->data)->a; 1745 PetscFunctionReturn(0); 1746 } 1747 1748 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1749 { 1750 PetscFunctionBegin; 1751 A->offloadmask = PETSC_OFFLOAD_CPU; 1752 *array = NULL; 1753 PetscFunctionReturn(0); 1754 } 1755 1756 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1757 { 1758 PetscFunctionBegin; 1759 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1760 *array = ((Mat_SeqAIJ*)A->data)->a; 1761 PetscFunctionReturn(0); 1762 } 1763 1764 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1765 { 1766 PetscFunctionBegin; 1767 *array = NULL; 1768 PetscFunctionReturn(0); 1769 } 1770 1771 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1772 { 1773 PetscFunctionBegin; 1774 *array = ((Mat_SeqAIJ*)A->data)->a; 1775 PetscFunctionReturn(0); 1776 } 1777 1778 static PetscErrorCode 
MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  /* host values were (re)written; mark the device copy stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Mirror the host CSR data on the GPU. If the nonzero pattern is unchanged and the format
   is CSR, only the values are re-uploaded; otherwise the whole device structure (descriptor,
   row offsets, column indices, values, scalar constants) is destroyed and rebuilt. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* the transpose values (if formed) no longer match; pattern is kept (PETSC_FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage keeps only the nonempty rows; ridx maps them back */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values yet: upload structure only */
        else nnz = a->nz;
        PetscCheckFalse(nnz && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* stage a temporary CSR copy on the device, convert it to HYB, then discard it */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor over a zipped (x,y) tuple: y += x */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor over a zipped (x,y) tuple: y = x */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor over a zipped (x,y) tuple: x = y (assignment in the reverse direction) */
struct
VecCUDAEqualsReverse 1962 { 1963 template <typename Tuple> 1964 __host__ __device__ 1965 void operator()(Tuple t) 1966 { 1967 thrust::get<0>(t) = thrust::get<1>(t); 1968 } 1969 }; 1970 1971 struct MatMatCusparse { 1972 PetscBool cisdense; 1973 PetscScalar *Bt; 1974 Mat X; 1975 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1976 PetscLogDouble flops; 1977 CsrMatrix *Bcsr; 1978 1979 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1980 cusparseSpMatDescr_t matSpBDescr; 1981 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1982 cusparseDnMatDescr_t matBDescr; 1983 cusparseDnMatDescr_t matCDescr; 1984 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1985 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 1986 void *dBuffer4; 1987 void *dBuffer5; 1988 #endif 1989 size_t mmBufferSize; 1990 void *mmBuffer; 1991 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1992 cusparseSpGEMMDescr_t spgemmDesc; 1993 #endif 1994 }; 1995 1996 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1997 { 1998 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1999 2000 PetscFunctionBegin; 2001 PetscCallCUDA(cudaFree(mmdata->Bt)); 2002 delete mmdata->Bcsr; 2003 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2004 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2005 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2006 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2007 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2008 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2009 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2010 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2011 #endif 2012 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2013 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2014 
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A) op(B) (including the PtAP/RARt variants, which finish with a
   dense-dense multiply against B) where A is MATSEQAIJCUSPARSE and B is dense. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the sparse operand (A or its explicit transpose), the cusparse op, and the result sizes */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt write the sparse-dense partial product into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ?
    CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* grow (never shrink) the work buffer */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt with a cublas geam */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ?
                           mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* for RARt/PtAP, finish with the dense-dense product of B (R or P) against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* convert results back to the caller's original (CPU) types if needed */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse(CUSPARSE) x dense(CUDA) products: set the sizes and type of C,
   allocate the MatMatCusparse product data (and any intermediate storage), and install the
   numeric-phase callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, just finish assembly bookkeeping */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetry lets AtB/ABt be computed as AB (no transpose needed); the symbolic
     phase must have made the same choice, which is verified here */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized with the explicitly stored transpose structs */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* sparsity pattern was fixed in symbolic; only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse*sparse products (AB, AtB, ABt) with both operands
   MATSEQAIJCUSPARSE: determines the sparsity pattern of C on the GPU via cuSPARSE
   SpGEMM, allocates C (device CSR arrays and host i/j), and stashes the SpGEMM
   descriptors/buffers in the MatMatCusparse product data for reuse in numeric. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry to avoid transposes; record the choice so numeric can verify it */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick operand mult structs (explicit transposes for AtB/ABt) and result sizes */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* non-owning view of B with uncompressed row offsets; freed by MatDestroy_MatMatCusparse */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* create C's descriptor with nnz 0 / NULL arrays; pointers are set after SpGEMM determines nnz */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void* dBuffer1 = NULL;
    void* dBuffer2 = NULL;
    void* dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the device CSR into the host (Mat_SeqAIJ) representation of C */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask
== PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
/* Dispatch: decide whether the product is computed with the CUSPARSE kernels above,
   the basic ABC fallback, or the CPU (SeqAIJ/SeqDense) implementations, honoring the
   per-product "*_backend_cpu" command line options when both operands live on the GPU. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* a matrix bound to the CPU disables the GPU path */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no direct CUSPARSE kernel for these; use the generic composition of basic products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A * xx (no transpose, no add) */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  /* zz = A * xx + yy */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A^H * xx */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /* zz = A^H * xx + yy */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* yy = A^T * xx */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n). 1D launch; each thread handles one entry and is
   bounds-guarded, so any grid covering n threads is valid. idx entries are assumed
   distinct (they index the compressed rows), so no atomics are needed. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: op(A) x is zero, so z is just y (or zero) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* either let cuSPARSE apply op() implicitly, or use a stored explicit transpose */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into cuSpMV[]; guard against future enum changes */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                                  matstruct->matDescr,
                                                  matstruct->cuSpMV[opA].vecXDescr, beta,
                                                  matstruct->cuSpMV[opA].vecYDescr,
                                                  cusparse_scalartype,
                                                  cusparsestruct->spmvAlg,
                                                  &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                     matstruct->alpha_one,
                                     matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr,
                                     beta,
                                     matstruct->cuSpMV[opA].vecYDescr,
                                     cusparse_scalartype,
                                     cusparsestruct->spmvAlg,
                                     matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                          mat->num_rows, mat->num_cols,
                                          mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                          mat->values->data().get(), mat->row_offsets->data().get(),
                                          mat->column_indices->data().get(), xptr, beta,
                                          dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                            matstruct->alpha_one, matstruct->descr, hybMat,
                                            xptr, beta,
                                            dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /* zz = A^T * xx + yy */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  /* a changed nonzero pattern invalidates the flat device copy of the matrix */
  if (onnz != A->nonzerostate && cusp->deviceMat) {

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.
A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}

/* Free the GPU-side storage (plain or factored), remove the composed methods, then
   fall through to the SeqAIJ destructor for the host-side data */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate via the host SeqAIJ path, then convert the copy in place back to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU: cublas axpy when the patterns match, cusparse spgeam for a
   subset pattern, otherwise fall back to the host SeqAIJ implementation */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* one of the two is bound to CPU: use the host path */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* spgeam computes a*X + b*Y with b = 1 */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t bufferSize;
    void   *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* scalars a,b live on the host for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                     &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                     &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                     cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y: scale the nonzero values in place with a cublas scal on the GPU */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz,&bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* Zero the values on both the device (CSR and its stored transpose, if any) and the host copy */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool  both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* GPU values zeroed too, so host and device stay in sync */
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch the matrix operations between the host (SeqAIJ) and device (CUSPARSE)
   implementations; flg = PETSC_TRUE binds to the CPU (pulling data off the GPU first) */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) { /* factored matrices only record the flag */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a host-only optimization */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: set up the cuSPARSE handle/defaults,
   install the GPU operations, and change the type name */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we
only support csr */ 3491 #else 3492 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3493 #endif 3494 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3495 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3496 #endif 3497 B->spptr = spptr; 3498 } else { 3499 Mat_SeqAIJCUSPARSETriFactors *spptr; 3500 3501 PetscCall(PetscNew(&spptr)); 3502 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 3503 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 3504 B->spptr = spptr; 3505 } 3506 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3507 } 3508 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3509 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3510 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3511 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3512 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3513 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3514 3515 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); 3516 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE)); 3517 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3518 #if defined(PETSC_HAVE_HYPRE) 3519 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 3520 #endif 3521 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 3522 PetscFunctionReturn(0); 3523 } 3524 3525 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3526 { 3527 PetscFunctionBegin; 3528 PetscCall(MatCreate_SeqAIJ(B)); 3529 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B)); 3530 PetscFunctionReturn(0); 3531 } 3532 3533 /*MC 3534 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE (and banded-CUSPARSE) solver types with PETSc's factor registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}

static PetscErrorCode
MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the pointers after freeing: MatSeqAIJCUSPARSE_Destroy() frees them again
       whenever they are non-NULL, so leaving them dangling causes a double cudaFree */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Free the whole GPU-side container of a MATSEQAIJCUSPARSE matrix (CSR copies,
   transpose, COO scratch, cusparse handle) and the container itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}

/* Free a device CsrMatrix (thrust vectors for values/col-indices/row-offsets) and zero the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one triangular-factor structure: cusparse descriptor, analysis info, CSR storage and buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)
PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}

/* Free a mat-mult structure: the CSR (or pre-CUDA-11 HYB) storage, descriptors,
   device scalar constants and the cached SpMV buffers/descriptors */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was unchecked; every other destroy in this file checks the return code */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) { /* one cached SpMV setup per operation (mult/transpose/hermitian) */
      if (mdata->cuSpMV[i].initialized) {
3662 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 3663 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 3664 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3665 } 3666 } 3667 #endif 3668 delete *matstruct; 3669 *matstruct = NULL; 3670 } 3671 PetscFunctionReturn(0); 3672 } 3673 3674 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3675 { 3676 PetscFunctionBegin; 3677 if (*trifactors) { 3678 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr)); 3679 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr)); 3680 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose)); 3681 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose)); 3682 delete (*trifactors)->rpermIndices; 3683 delete (*trifactors)->cpermIndices; 3684 delete (*trifactors)->workVector; 3685 (*trifactors)->rpermIndices = NULL; 3686 (*trifactors)->cpermIndices = NULL; 3687 (*trifactors)->workVector = NULL; 3688 if ((*trifactors)->a_band_d) PetscCallCUDA(cudaFree((*trifactors)->a_band_d)); 3689 if ((*trifactors)->i_band_d) PetscCallCUDA(cudaFree((*trifactors)->i_band_d)); 3690 (*trifactors)->init_dev_prop = PETSC_FALSE; 3691 } 3692 PetscFunctionReturn(0); 3693 } 3694 3695 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3696 { 3697 cusparseHandle_t handle; 3698 3699 PetscFunctionBegin; 3700 if (*trifactors) { 3701 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 3702 if (handle = (*trifactors)->handle) { 3703 PetscCallCUSPARSE(cusparseDestroy(handle)); 3704 } 3705 PetscCall(PetscFree(*trifactors)); 3706 } 3707 PetscFunctionReturn(0); 3708 } 3709 3710 struct IJCompare 3711 { 3712 __host__ __device__ 3713 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3714 { 3715 if (t1.get<0>() < 
t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (row,col) tuples; used to collapse repeated COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 1 where two consecutive values differ, 0 where equal (fed to adjacent_difference) */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

/* Logical OR expressed as a PetscInt binary operator */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic():
   scatter/reduce the user's COO values v[] into the device CSR values using the
   permutation (cooPerm) and, when duplicates exist, the compaction map (cooPerm_a). */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix *matrix;
  PetscInt n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was made: just run a regular (empty) assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v =
thrust::device_pointer_cast(v);
  } else {
    /* v[] is on the host: stage it into a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: direct scatter values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the GPU copy is now the authoritative one */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose out of date; with destroy=PETSC_TRUE also free it and the csr2csc map */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* COO size changed: discard stale permutation/compaction arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up
with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    /* build the (compressed) row offsets by binary-searching row ids in the sorted, deduplicated d_i */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5].
A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host-side CSR arrays (a->i/a->j) from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point: dispatch to the fast 'Basic' device path when the
   indices are on the device or contain no negative (ignored) entries; otherwise use the
   general host path and mirror its jmap/perm arrays onto the device (extended COO). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices mean "ignore this entry": only the general path handles them */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ*>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}

__global__ void
MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* Grid-stride loop over the nnz unique entries; a[i] accumulates kv[perm[k]] for
     k in [jmap[i],jmap[i+1]). Widen to PetscCount (64-bit) BEFORE multiplying:
     blockIdx.x*blockDim.x and gridDim.x*blockDim.x are otherwise computed in
     32-bit unsigned and silently wrap once the launch covers more than 2^32
     threads, which is reachable since nnz is a 64-bit PetscCount. */
  PetscCount i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x*blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}

/* Set (or add) COO values: extended path launches MatAddCOOValues with the device
   jmap/perm arrays, staging host v[] to the device first; otherwise defers to the
   'Basic' thrust implementation. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      /* NOTE(review): grid dim is truncated to int here; the grid-stride loop in the
         kernel keeps this correct even if (Annz+255)/256 were clamped by the runtime */
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily upload the full (uncompressed) host row offsets and cache them */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or
PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only invalidate the caller's pointers; the device data itself stays cached */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read access: make sure the GPU copy is current */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore
the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state bump, no cache invalidation needed */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: GPU copy must be current before handing it out */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write: GPU becomes the authoritative copy */
PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* values may change, so the cached transpose is stale */
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been modified: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only: no CopyToGPU here, the current contents will be overwritten */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller's writes make the GPU copy authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Order (row,col,val,index) 4-tuples lexicographically by row, then column */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Add a fixed shift to a column index (used when concatenating column spaces) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
   [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* NOTE(review): typo in the message below -- "number or rows" should read "number of rows" (also in the reuse branch) */
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] from scratch: both the symbolic (pattern) and numeric (values) phases run on the GPU */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    /* C is stored as a plain (non-compressed-row) CSR matrix */
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the cuSPARSE SpMV/SpMM calls */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records where each entry of A (first Annz slots) and B (remaining Bnnz slots)
       landed inside C; it is what makes the cheap MAT_REUSE_MATRIX path possible */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into explicit COO row indices so every entry
         becomes a (row,col) pair that can be merged across the two matrices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* source tags: 1 = entry from A, 0 = entry from B (int rather than bool, see note) */
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B occupies columns [A->cmap->n, n) of C, hence the shift */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two (row,col,val,tag) streams ordered by (row,col); wPerm receives
         the source tags in merged order */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
      /* convert the tags into the permutation cooPerm: destination positions of A's
         entries (tag 1) fill the first Annz slots, B's (tag 0) the rest */
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T] stacked vertically: B^T's row offsets are appended shifted by
           nnz(A); column indices and values are plain concatenations */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* the last offset of A^T coincides with the first (shifted) offset of B^T */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the GPU pattern on the host side of the Mat_SeqAIJ so the matrix is usable as a regular AIJ */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* per-row lengths and row statistics derived from the freshly copied row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: C's sparsity pattern is unchanged; only scatter the current values
       of A and B into C through the permutation saved in cooPerm */
    PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* cooPerm[0..Annz) are the destinations of A's values, cooPerm[Annz..) those of B's */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose &&
B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4610 PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4611 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4612 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4613 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4614 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4615 auto vT = CcsrT->values->begin(); 4616 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4617 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4618 (*C)->transupdated = PETSC_TRUE; 4619 } 4620 PetscCall(PetscLogGpuTimeEnd()); 4621 } 4622 } 4623 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4624 (*C)->assembled = PETSC_TRUE; 4625 (*C)->was_assembled = PETSC_FALSE; 4626 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4627 PetscFunctionReturn(0); 4628 } 4629 4630 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4631 { 4632 bool dmem; 4633 const PetscScalar *av; 4634 4635 PetscFunctionBegin; 4636 dmem = isCudaMem(v); 4637 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 4638 if (n && idx) { 4639 THRUSTINTARRAY widx(n); 4640 widx.assign(idx,idx+n); 4641 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 4642 4643 THRUSTARRAY *w = NULL; 4644 thrust::device_ptr<PetscScalar> dv; 4645 if (dmem) { 4646 dv = thrust::device_pointer_cast(v); 4647 } else { 4648 w = new THRUSTARRAY(n); 4649 dv = w->data(); 4650 } 4651 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4652 4653 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4654 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4655 
thrust::for_each(zibit,zieit,VecCUDAEquals()); 4656 if (w) { 4657 PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4658 } 4659 delete w; 4660 } else { 4661 PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4662 } 4663 if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4664 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4665 PetscFunctionReturn(0); 4666 } 4667