/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* String tables consumed by PetscOptionsEnum(); the trailing three entries are the
   enum-type name, the option prefix, and the required NULL terminator */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Reports MATSOLVERCUSPARSE as the solver package that produced a factored matrix */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine for the "cusparse" solver package: creates the factor matrix B (a
   MATSEQAIJCUSPARSE of the same square size as A) and installs the symbolic-factorization
   function pointers appropriate for the requested factor type.  When A is bound to the
   CPU the plain SeqAIJ symbolic routines are installed instead. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   storage format in the Mat_SeqAIJCUSPARSE struct (MULT and ALL set the same field here) */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2) 184 185 Output Parameter: 186 187 Level: intermediate 188 189 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 190 @*/ 191 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 192 { 193 PetscFunctionBegin; 194 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 195 PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)); 196 PetscFunctionReturn(0); 197 } 198 199 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 200 { 201 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 202 203 PetscFunctionBegin; 204 cusparsestruct->use_cpu_solve = use_cpu; 205 PetscFunctionReturn(0); 206 } 207 208 /*@ 209 MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 210 211 Input Parameters: 212 + A - Matrix of type SEQAIJCUSPARSE 213 - use_cpu - set flag for using the built-in CPU MatSolve 214 215 Output Parameter: 216 217 Notes: 218 The cuSparse LU solver currently computes the factors with the built-in CPU method 219 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 220 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 
221 222 Level: intermediate 223 224 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 225 @*/ 226 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 227 { 228 PetscFunctionBegin; 229 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 230 PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 231 PetscFunctionReturn(0); 232 } 233 234 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 235 { 236 PetscFunctionBegin; 237 switch (op) { 238 case MAT_FORM_EXPLICIT_TRANSPOSE: 239 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 240 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 241 A->form_explicit_transpose = flg; 242 break; 243 default: 244 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 245 break; 246 } 247 PetscFunctionReturn(0); 248 } 249 250 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 251 252 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 253 { 254 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 255 IS isrow = b->row,iscol = b->col; 256 PetscBool row_identity,col_identity; 257 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 258 259 PetscFunctionBegin; 260 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 261 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 262 B->offloadmask = PETSC_OFFLOAD_CPU; 263 /* determine which version of MatSolve needs to be used. 
*/ 264 PetscCall(ISIdentity(isrow,&row_identity)); 265 PetscCall(ISIdentity(iscol,&col_identity)); 266 if (row_identity && col_identity) { 267 if (!cusparsestruct->use_cpu_solve) { 268 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 269 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 270 } 271 B->ops->matsolve = NULL; 272 B->ops->matsolvetranspose = NULL; 273 } else { 274 if (!cusparsestruct->use_cpu_solve) { 275 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 276 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 277 } 278 B->ops->matsolve = NULL; 279 B->ops->matsolvetranspose = NULL; 280 } 281 282 /* get the triangular factors */ 283 if (!cusparsestruct->use_cpu_solve) { 284 PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 285 } 286 PetscFunctionReturn(0); 287 } 288 289 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 290 { 291 PetscErrorCode ierr; 292 MatCUSPARSEStorageFormat format; 293 PetscBool flg; 294 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 295 296 PetscFunctionBegin; 297 PetscCall(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options")); 298 if (A->factortype == MAT_FACTOR_NONE) { 299 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 300 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);PetscCall(ierr); 301 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 302 303 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 304 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);PetscCall(ierr); 305 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 306 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU 
solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 307 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 308 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 309 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 310 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);PetscCall(ierr); 311 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 312 #if PETSC_PKG_CUDA_VERSION_GE(11,2,0) 313 PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 314 #else 315 PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 316 #endif 317 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 318 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);PetscCall(ierr); 319 PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 320 321 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 322 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);PetscCall(ierr); 323 PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc 
has not been updated accordingly"); 324 #endif 325 } 326 PetscCall(PetscOptionsTail()); 327 PetscFunctionReturn(0); 328 } 329 330 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 331 { 332 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 333 334 PetscFunctionBegin; 335 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 336 PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 337 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 338 PetscFunctionReturn(0); 339 } 340 341 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 342 { 343 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 344 345 PetscFunctionBegin; 346 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 347 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 348 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 349 PetscFunctionReturn(0); 350 } 351 352 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 353 { 354 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 355 356 PetscFunctionBegin; 357 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 358 PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 359 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 360 PetscFunctionReturn(0); 361 } 362 363 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 364 { 365 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 366 367 PetscFunctionBegin; 368 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 369 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 370 B->ops->choleskyfactornumeric 
= MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 371 PetscFunctionReturn(0); 372 } 373 374 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 375 { 376 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 377 PetscInt n = A->rmap->n; 378 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 379 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 380 const PetscInt *ai = a->i,*aj = a->j,*vi; 381 const MatScalar *aa = a->a,*v; 382 PetscInt *AiLo, *AjLo; 383 PetscInt i,nz, nzLower, offset, rowOffset; 384 385 PetscFunctionBegin; 386 if (!n) PetscFunctionReturn(0); 387 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 388 try { 389 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 390 nzLower=n+ai[n]-ai[1]; 391 if (!loTriFactor) { 392 PetscScalar *AALo; 393 394 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); 395 396 /* Allocate Space for the lower triangular matrix */ 397 PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt))); 398 PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt))); 399 400 /* Fill the lower triangular matrix */ 401 AiLo[0] = (PetscInt) 0; 402 AiLo[n] = nzLower; 403 AjLo[0] = (PetscInt) 0; 404 AALo[0] = (MatScalar) 1.0; 405 v = aa; 406 vi = aj; 407 offset = 1; 408 rowOffset= 1; 409 for (i=1; i<n; i++) { 410 nz = ai[i+1] - ai[i]; 411 /* additional 1 for the term on the diagonal */ 412 AiLo[i] = rowOffset; 413 rowOffset += nz+1; 414 415 PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 416 PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 417 418 offset += nz; 419 AjLo[offset] = (PetscInt) i; 420 AALo[offset] = (MatScalar) 1.0; 421 offset += 1; 422 423 v += nz; 424 vi += nz; 425 } 426 427 /* allocate space for the triangular factor information */ 428 PetscCall(PetscNew(&loTriFactor)); 429 
loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 430 /* Create the matrix description */ 431 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 432 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 433 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 434 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 435 #else 436 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 437 #endif 438 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 439 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 440 441 /* set the operation */ 442 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 443 444 /* set the matrix */ 445 loTriFactor->csrMat = new CsrMatrix; 446 loTriFactor->csrMat->num_rows = n; 447 loTriFactor->csrMat->num_cols = n; 448 loTriFactor->csrMat->num_entries = nzLower; 449 450 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 451 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 452 453 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 454 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 455 456 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 457 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 458 459 /* Create the solve analysis information */ 460 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 461 PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 462 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 463 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 464 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 465 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 466 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 467 
&loTriFactor->solveBufferSize)); 468 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 469 #endif 470 471 /* perform the solve analysis */ 472 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 473 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 474 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 475 loTriFactor->csrMat->column_indices->data().get(), 476 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 477 loTriFactor->solveInfo, 478 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 479 #else 480 loTriFactor->solveInfo)); 481 #endif 482 PetscCallCUDA(WaitForCUDA()); 483 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 484 485 /* assign the pointer */ 486 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 487 loTriFactor->AA_h = AALo; 488 PetscCallCUDA(cudaFreeHost(AiLo)); 489 PetscCallCUDA(cudaFreeHost(AjLo)); 490 PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar))); 491 } else { /* update values only */ 492 if (!loTriFactor->AA_h) { 493 PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar))); 494 } 495 /* Fill the lower triangular matrix */ 496 loTriFactor->AA_h[0] = 1.0; 497 v = aa; 498 vi = aj; 499 offset = 1; 500 for (i=1; i<n; i++) { 501 nz = ai[i+1] - ai[i]; 502 PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 503 offset += nz; 504 loTriFactor->AA_h[offset] = 1.0; 505 offset += 1; 506 v += nz; 507 } 508 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 509 PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar))); 510 } 511 } catch(char *ex) { 512 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 513 } 514 } 515 PetscFunctionReturn(0); 516 } 517 518 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 519 { 520 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 
521 PetscInt n = A->rmap->n; 522 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 523 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 524 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 525 const MatScalar *aa = a->a,*v; 526 PetscInt *AiUp, *AjUp; 527 PetscInt i,nz, nzUpper, offset; 528 529 PetscFunctionBegin; 530 if (!n) PetscFunctionReturn(0); 531 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 532 try { 533 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 534 nzUpper = adiag[0]-adiag[n]; 535 if (!upTriFactor) { 536 PetscScalar *AAUp; 537 538 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 539 540 /* Allocate Space for the upper triangular matrix */ 541 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 542 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 543 544 /* Fill the upper triangular matrix */ 545 AiUp[0]=(PetscInt) 0; 546 AiUp[n]=nzUpper; 547 offset = nzUpper; 548 for (i=n-1; i>=0; i--) { 549 v = aa + adiag[i+1] + 1; 550 vi = aj + adiag[i+1] + 1; 551 552 /* number of elements NOT on the diagonal */ 553 nz = adiag[i] - adiag[i+1]-1; 554 555 /* decrement the offset */ 556 offset -= (nz+1); 557 558 /* first, set the diagonal elements */ 559 AjUp[offset] = (PetscInt) i; 560 AAUp[offset] = (MatScalar)1./v[nz]; 561 AiUp[i] = AiUp[i+1] - (nz+1); 562 563 PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 564 PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 565 } 566 567 /* allocate space for the triangular factor information */ 568 PetscCall(PetscNew(&upTriFactor)); 569 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 570 571 /* Create the matrix description */ 572 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 573 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, 
CUSPARSE_INDEX_BASE_ZERO)); 574 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 575 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 576 #else 577 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 578 #endif 579 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 580 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 581 582 /* set the operation */ 583 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 584 585 /* set the matrix */ 586 upTriFactor->csrMat = new CsrMatrix; 587 upTriFactor->csrMat->num_rows = n; 588 upTriFactor->csrMat->num_cols = n; 589 upTriFactor->csrMat->num_entries = nzUpper; 590 591 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 592 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 593 594 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 595 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 596 597 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 598 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 599 600 /* Create the solve analysis information */ 601 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 602 PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 603 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 604 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 605 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 606 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 607 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 608 &upTriFactor->solveBufferSize)); 609 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 610 #endif 611 612 /* perform the solve analysis */ 613 
PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 614 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 615 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 616 upTriFactor->csrMat->column_indices->data().get(), 617 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 618 upTriFactor->solveInfo, 619 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 620 #else 621 upTriFactor->solveInfo)); 622 #endif 623 PetscCallCUDA(WaitForCUDA()); 624 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 625 626 /* assign the pointer */ 627 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 628 upTriFactor->AA_h = AAUp; 629 PetscCallCUDA(cudaFreeHost(AiUp)); 630 PetscCallCUDA(cudaFreeHost(AjUp)); 631 PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 632 } else { 633 if (!upTriFactor->AA_h) { 634 PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 635 } 636 /* Fill the upper triangular matrix */ 637 offset = nzUpper; 638 for (i=n-1; i>=0; i--) { 639 v = aa + adiag[i+1] + 1; 640 641 /* number of elements NOT on the diagonal */ 642 nz = adiag[i] - adiag[i+1]-1; 643 644 /* decrement the offset */ 645 offset -= (nz+1); 646 647 /* first, set the diagonal elements */ 648 upTriFactor->AA_h[offset] = 1./v[nz]; 649 PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 650 } 651 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 652 PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 653 } 654 } catch(char *ex) { 655 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 656 } 657 } 658 PetscFunctionReturn(0); 659 } 660 661 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 662 { 663 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 664 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 
665 IS isrow = a->row,iscol = a->icol; 666 PetscBool row_identity,col_identity; 667 PetscInt n = A->rmap->n; 668 669 PetscFunctionBegin; 670 PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 671 PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 672 PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 673 674 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 675 cusparseTriFactors->nnz=a->nz; 676 677 A->offloadmask = PETSC_OFFLOAD_BOTH; 678 /* lower triangular indices */ 679 PetscCall(ISIdentity(isrow,&row_identity)); 680 if (!row_identity && !cusparseTriFactors->rpermIndices) { 681 const PetscInt *r; 682 683 PetscCall(ISGetIndices(isrow,&r)); 684 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 685 cusparseTriFactors->rpermIndices->assign(r, r+n); 686 PetscCall(ISRestoreIndices(isrow,&r)); 687 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 688 } 689 690 /* upper triangular indices */ 691 PetscCall(ISIdentity(iscol,&col_identity)); 692 if (!col_identity && !cusparseTriFactors->cpermIndices) { 693 const PetscInt *c; 694 695 PetscCall(ISGetIndices(iscol,&c)); 696 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 697 cusparseTriFactors->cpermIndices->assign(c, c+n); 698 PetscCall(ISRestoreIndices(iscol,&c)); 699 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 700 } 701 PetscFunctionReturn(0); 702 } 703 704 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 705 { 706 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 707 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 708 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 709 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 710 PetscInt *AiUp, *AjUp; 711 PetscScalar *AAUp; 712 PetscScalar *AALo; 713 PetscInt nzUpper = a->nz,n 
= A->rmap->n,i,offset,nz,j; 714 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 715 const PetscInt *ai = b->i,*aj = b->j,*vj; 716 const MatScalar *aa = b->a,*v; 717 718 PetscFunctionBegin; 719 if (!n) PetscFunctionReturn(0); 720 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 721 try { 722 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 723 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 724 if (!upTriFactor && !loTriFactor) { 725 /* Allocate Space for the upper triangular matrix */ 726 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 727 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 728 729 /* Fill the upper triangular matrix */ 730 AiUp[0]=(PetscInt) 0; 731 AiUp[n]=nzUpper; 732 offset = 0; 733 for (i=0; i<n; i++) { 734 /* set the pointers */ 735 v = aa + ai[i]; 736 vj = aj + ai[i]; 737 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 738 739 /* first, set the diagonal elements */ 740 AjUp[offset] = (PetscInt) i; 741 AAUp[offset] = (MatScalar)1.0/v[nz]; 742 AiUp[i] = offset; 743 AALo[offset] = (MatScalar)1.0/v[nz]; 744 745 offset+=1; 746 if (nz>0) { 747 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 748 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 749 for (j=offset; j<offset+nz; j++) { 750 AAUp[j] = -AAUp[j]; 751 AALo[j] = AAUp[j]/v[nz]; 752 } 753 offset+=nz; 754 } 755 } 756 757 /* allocate space for the triangular factor information */ 758 PetscCall(PetscNew(&upTriFactor)); 759 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 760 761 /* Create the matrix description */ 762 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 763 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 764 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 765 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 766 #else 767 
PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 768 #endif 769 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 770 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 771 772 /* set the matrix */ 773 upTriFactor->csrMat = new CsrMatrix; 774 upTriFactor->csrMat->num_rows = A->rmap->n; 775 upTriFactor->csrMat->num_cols = A->cmap->n; 776 upTriFactor->csrMat->num_entries = a->nz; 777 778 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 779 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 780 781 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 782 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 783 784 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 785 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 786 787 /* set the operation */ 788 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 789 790 /* Create the solve analysis information */ 791 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 792 PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 793 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 794 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 795 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 796 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 797 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 798 &upTriFactor->solveBufferSize)); 799 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 800 #endif 801 802 /* perform the solve analysis */ 803 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 804 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 805 
upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 806 upTriFactor->csrMat->column_indices->data().get(), 807 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 808 upTriFactor->solveInfo, 809 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 810 #else 811 upTriFactor->solveInfo)); 812 #endif 813 PetscCallCUDA(WaitForCUDA()); 814 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 815 816 /* assign the pointer */ 817 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 818 819 /* allocate space for the triangular factor information */ 820 PetscCall(PetscNew(&loTriFactor)); 821 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 822 823 /* Create the matrix description */ 824 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 825 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 826 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 827 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 828 #else 829 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 830 #endif 831 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 832 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 833 834 /* set the operation */ 835 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 836 837 /* set the matrix */ 838 loTriFactor->csrMat = new CsrMatrix; 839 loTriFactor->csrMat->num_rows = A->rmap->n; 840 loTriFactor->csrMat->num_cols = A->cmap->n; 841 loTriFactor->csrMat->num_entries = a->nz; 842 843 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 844 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 845 846 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 847 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 848 849 loTriFactor->csrMat->values = new 
THRUSTARRAY(a->nz); 850 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 851 852 /* Create the solve analysis information */ 853 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 854 PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 855 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 856 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 857 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 858 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 859 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 860 &loTriFactor->solveBufferSize)); 861 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 862 #endif 863 864 /* perform the solve analysis */ 865 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 866 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 867 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 868 loTriFactor->csrMat->column_indices->data().get(), 869 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 870 loTriFactor->solveInfo, 871 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 872 #else 873 loTriFactor->solveInfo)); 874 #endif 875 PetscCallCUDA(WaitForCUDA()); 876 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 877 878 /* assign the pointer */ 879 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 880 881 PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 882 PetscCallCUDA(cudaFreeHost(AiUp)); 883 PetscCallCUDA(cudaFreeHost(AjUp)); 884 } else { 885 /* Fill the upper triangular matrix */ 886 offset = 0; 887 for (i=0; i<n; i++) { 888 /* set the pointers */ 889 v = aa + ai[i]; 890 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 891 892 /* first, set the diagonal 
elements */ 893 AAUp[offset] = 1.0/v[nz]; 894 AALo[offset] = 1.0/v[nz]; 895 896 offset+=1; 897 if (nz>0) { 898 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 899 for (j=offset; j<offset+nz; j++) { 900 AAUp[j] = -AAUp[j]; 901 AALo[j] = AAUp[j]/v[nz]; 902 } 903 offset+=nz; 904 } 905 } 906 PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 907 PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 908 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 909 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 910 PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 911 } 912 PetscCallCUDA(cudaFreeHost(AAUp)); 913 PetscCallCUDA(cudaFreeHost(AALo)); 914 } catch(char *ex) { 915 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 916 } 917 } 918 PetscFunctionReturn(0); 919 } 920 921 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 922 { 923 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 924 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 925 IS ip = a->row; 926 PetscBool perm_identity; 927 PetscInt n = A->rmap->n; 928 929 PetscFunctionBegin; 930 PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 931 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 932 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 933 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 934 935 A->offloadmask = PETSC_OFFLOAD_BOTH; 936 937 /* lower triangular indices */ 938 PetscCall(ISIdentity(ip,&perm_identity)); 939 if (!perm_identity) { 940 IS iip; 941 const PetscInt *irip,*rip; 942 943 PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 944 PetscCall(ISGetIndices(iip,&irip)); 945 PetscCall(ISGetIndices(ip,&rip)); 946 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 947 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 948 cusparseTriFactors->cpermIndices = new 
THRUSTINTARRAY(n); 949 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 950 PetscCall(ISRestoreIndices(iip,&irip)); 951 PetscCall(ISDestroy(&iip)); 952 PetscCall(ISRestoreIndices(ip,&rip)); 953 PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 954 } 955 PetscFunctionReturn(0); 956 } 957 958 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 959 { 960 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 961 IS ip = b->row; 962 PetscBool perm_identity; 963 964 PetscFunctionBegin; 965 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 966 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 967 B->offloadmask = PETSC_OFFLOAD_CPU; 968 /* determine which version of MatSolve needs to be used. */ 969 PetscCall(ISIdentity(ip,&perm_identity)); 970 if (perm_identity) { 971 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 972 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 973 B->ops->matsolve = NULL; 974 B->ops->matsolvetranspose = NULL; 975 } else { 976 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 977 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 978 B->ops->matsolve = NULL; 979 B->ops->matsolvetranspose = NULL; 980 } 981 982 /* get the triangular factors */ 983 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 984 PetscFunctionReturn(0); 985 } 986 987 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 988 { 989 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 990 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 991 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 992 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 993 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 994 cusparseIndexBase_t indexBase; 995 cusparseMatrixType_t matrixType; 996 cusparseFillMode_t fillMode; 997 
cusparseDiagType_t diagType; 998 999 PetscFunctionBegin; 1000 /* allocate space for the transpose of the lower triangular factor */ 1001 PetscCall(PetscNew(&loTriFactorT)); 1002 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1003 1004 /* set the matrix descriptors of the lower triangular factor */ 1005 matrixType = cusparseGetMatType(loTriFactor->descr); 1006 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1007 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1008 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1009 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1010 1011 /* Create the matrix description */ 1012 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1013 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1014 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1015 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1016 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1017 1018 /* set the operation */ 1019 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1020 1021 /* allocate GPU space for the CSC of the lower triangular factor*/ 1022 loTriFactorT->csrMat = new CsrMatrix; 1023 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1024 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1025 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1026 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1027 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1028 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1029 1030 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1031 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1032 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1033 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1034 loTriFactor->csrMat->values->data().get(), 1035 loTriFactor->csrMat->row_offsets->data().get(), 1036 loTriFactor->csrMat->column_indices->data().get(), 1037 loTriFactorT->csrMat->values->data().get(), 1038 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1039 CUSPARSE_ACTION_NUMERIC,indexBase, 1040 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1041 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 1042 #endif 1043 1044 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1045 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1046 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1047 loTriFactor->csrMat->values->data().get(), 1048 loTriFactor->csrMat->row_offsets->data().get(), 1049 loTriFactor->csrMat->column_indices->data().get(), 1050 loTriFactorT->csrMat->values->data().get(), 1051 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1052 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1053 CUSPARSE_ACTION_NUMERIC, indexBase, 1054 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 1055 #else 1056 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1057 CUSPARSE_ACTION_NUMERIC, indexBase)); 1058 #endif 1059 PetscCallCUDA(WaitForCUDA()); 1060 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1061 1062 /* Create the solve analysis information */ 1063 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1064 PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo)); 1065 #if 
PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1066 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1067 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1068 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1069 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1070 &loTriFactorT->solveBufferSize)); 1071 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 1072 #endif 1073 1074 /* perform the solve analysis */ 1075 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1076 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1077 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1078 loTriFactorT->csrMat->column_indices->data().get(), 1079 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1080 loTriFactorT->solveInfo, 1081 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1082 #else 1083 loTriFactorT->solveInfo)); 1084 #endif 1085 PetscCallCUDA(WaitForCUDA()); 1086 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1087 1088 /* assign the pointer */ 1089 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1090 1091 /*********************************************/ 1092 /* Now the Transpose of the Upper Tri Factor */ 1093 /*********************************************/ 1094 1095 /* allocate space for the transpose of the upper triangular factor */ 1096 PetscCall(PetscNew(&upTriFactorT)); 1097 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1098 1099 /* set the matrix descriptors of the upper triangular factor */ 1100 matrixType = cusparseGetMatType(upTriFactor->descr); 1101 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1102 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1103 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1104 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1105 1106 /* Create the matrix description */ 1107 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1108 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1109 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1110 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1111 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1112 1113 /* set the operation */ 1114 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1115 1116 /* allocate GPU space for the CSC of the upper triangular factor*/ 1117 upTriFactorT->csrMat = new CsrMatrix; 1118 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1119 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1120 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1121 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1122 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1123 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1124 1125 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1126 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1127 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1128 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1129 upTriFactor->csrMat->values->data().get(), 1130 upTriFactor->csrMat->row_offsets->data().get(), 1131 upTriFactor->csrMat->column_indices->data().get(), 1132 upTriFactorT->csrMat->values->data().get(), 1133 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1134 CUSPARSE_ACTION_NUMERIC,indexBase, 1135 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1136 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1137 #endif 1138 1139 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1140 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1141 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1142 upTriFactor->csrMat->values->data().get(), 1143 upTriFactor->csrMat->row_offsets->data().get(), 1144 upTriFactor->csrMat->column_indices->data().get(), 1145 upTriFactorT->csrMat->values->data().get(), 1146 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1147 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1148 CUSPARSE_ACTION_NUMERIC, indexBase, 1149 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1150 #else 1151 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1152 CUSPARSE_ACTION_NUMERIC, indexBase)); 1153 #endif 1154 1155 PetscCallCUDA(WaitForCUDA()); 1156 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1157 1158 /* Create the solve analysis information */ 1159 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1160 PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo)); 1161 
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1162 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1163 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1164 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1165 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1166 &upTriFactorT->solveBufferSize)); 1167 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1168 #endif 1169 1170 /* perform the solve analysis */ 1171 /* christ, would it have killed you to put this stuff in a function????????? */ 1172 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1173 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1174 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1175 upTriFactorT->csrMat->column_indices->data().get(), 1176 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1177 upTriFactorT->solveInfo, 1178 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1179 #else 1180 upTriFactorT->solveInfo)); 1181 #endif 1182 1183 PetscCallCUDA(WaitForCUDA()); 1184 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1185 1186 /* assign the pointer */ 1187 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1188 PetscFunctionReturn(0); 1189 } 1190 1191 struct PetscScalarToPetscInt 1192 { 1193 __host__ __device__ 1194 PetscInt operator()(PetscScalar s) 1195 { 1196 return (PetscInt)PetscRealPart(s); 1197 } 1198 }; 1199 1200 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1201 { 1202 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1203 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1204 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1205 cusparseStatus_t stat; 1206 cusparseIndexBase_t indexBase; 1207 1208 PetscFunctionBegin; 
1209 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1210 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1211 PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1212 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1213 PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1214 if (A->transupdated) PetscFunctionReturn(0); 1215 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1216 PetscCall(PetscLogGpuTimeBegin()); 1217 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1218 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1219 } 1220 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1221 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1222 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1223 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1224 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1225 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1226 1227 /* set alpha and beta */ 1228 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 1229 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 1230 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1231 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1232 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1233 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1234 1235 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1236 CsrMatrix *matrixT = new CsrMatrix; 1237 matstructT->mat = matrixT; 1238 matrixT->num_rows = A->cmap->n; 1239 matrixT->num_cols = A->rmap->n; 1240 matrixT->num_entries = a->nz; 
1241 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1242 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1243 matrixT->values = new THRUSTARRAY(a->nz); 1244 1245 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1246 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1247 1248 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1249 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1250 stat = cusparseCreateCsr(&matstructT->matDescr, 1251 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1252 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1253 matrixT->values->data().get(), 1254 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1255 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1256 #else 1257 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1258 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1259 1260 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1261 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1262 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1263 */ 1264 if (matrixT->num_entries) { 1265 stat = cusparseCreateCsr(&matstructT->matDescr, 1266 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1267 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1268 matrixT->values->data().get(), 1269 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1270 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1271 1272 } else { 1273 matstructT->matDescr = NULL; 1274 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1275 } 1276 #endif 1277 #endif 1278 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1279 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1280 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1281 #else 1282 CsrMatrix *temp = new CsrMatrix; 1283 CsrMatrix *tempT = new CsrMatrix; 1284 /* First convert HYB to CSR */ 1285 temp->num_rows = A->rmap->n; 1286 temp->num_cols = A->cmap->n; 1287 temp->num_entries = a->nz; 1288 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1289 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1290 temp->values = new THRUSTARRAY(a->nz); 1291 1292 stat = cusparse_hyb2csr(cusparsestruct->handle, 1293 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1294 temp->values->data().get(), 1295 temp->row_offsets->data().get(), 1296 temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1297 1298 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1299 tempT->num_rows = A->rmap->n; 1300 tempT->num_cols = A->cmap->n; 1301 tempT->num_entries = a->nz; 1302 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1303 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1304 tempT->values = new THRUSTARRAY(a->nz); 1305 1306 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1307 temp->num_cols, temp->num_entries, 1308 temp->values->data().get(), 1309 temp->row_offsets->data().get(), 1310 temp->column_indices->data().get(), 1311 tempT->values->data().get(), 1312 tempT->column_indices->data().get(), 1313 tempT->row_offsets->data().get(), 1314 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1315 1316 /* Last, convert CSC to HYB */ 1317 cusparseHybMat_t hybMat; 1318 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1319 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1320 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1321 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1322 matstructT->descr, tempT->values->data().get(), 1323 tempT->row_offsets->data().get(), 1324 tempT->column_indices->data().get(), 1325 hybMat, 0, partition);PetscCallCUSPARSE(stat); 1326 1327 /* assign the pointer */ 1328 matstructT->mat = hybMat; 1329 A->transupdated = PETSC_TRUE; 1330 /* delete temporaries */ 1331 if (tempT) { 1332 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1333 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1334 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1335 delete (CsrMatrix*) tempT; 1336 } 1337 if (temp) { 1338 if (temp->values) delete (THRUSTARRAY*) temp->values; 1339 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1340 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1341 delete (CsrMatrix*) temp; 1342 } 1343 #endif 1344 } 1345 } 1346 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      /* Mirror the host row offsets a->i (length nrows+1) on the GPU so csr2csc can consume them */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build csr2csc_i: the permutation that maps CSR value positions to CSC (i.e. transpose) value
         positions. We run csr2csc once on the value sequence 0,1,2,... and read the permuted result back
         as integers, so later transpose updates are a pure thrust gather (no csr2csc call needed). */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 csr2cscEx2 requires an explicit user-provided work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
                               #else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
                               #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* Convert the permuted 0,1,2,... values (PetscScalar) back into the integer permutation array */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
     #endif
    }
    /* Update the transpose values by gathering through the precomputed csr2csc permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Triangular solve with the transposed factors: x = (LU)^{-T} b, including the row/column
   permutations of the factorization.  The transposed lower/upper factor structures are built
   lazily on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: gather b through rpermIndices into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: note the factor order is reversed w.r.t. the forward solve
     (transposing A = LU gives A^T = U^T L^T, so the upper factor is applied first) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                       #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                       #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                       #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                       #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                       #else
                        xarray);PetscCallCUSPARSE(stat);
                       #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/*
   Transposed triangular solve for factors stored in natural ordering: no row/column
   permutation steps are needed, so b feeds the U^T solve directly.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U (transposed factor order: A^T = U^T L^T, so U comes first) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                       #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                       #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                       #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                       #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                       #else
                        xarray);PetscCallCUSPARSE(stat);
                       #endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/*
   Forward/backward triangular solve x = (LU)^{-1} b with row/column permutations:
   permute b, solve L then U, then permute the result back.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                       #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                       #else
                        xarray);PetscCallCUSPARSE(stat);
                       #endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                       #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                       #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                       #endif

  /* Last, reorder with the column permutation: scatter tempGPU through cpermIndices into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/*
   Forward/backward triangular solve for factors stored in natural ordering:
   no permutation steps, b feeds the L solve directly and the U solve writes x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                       #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                       #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                       #endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                       #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                       #else
                        xarray);PetscCallCUSPARSE(stat);
                       #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/*
   Copy the matrix values (not the sparsity pattern) from the GPU CSR storage back to the host
   Mat_SeqAIJ arrays, when the GPU copy is the up-to-date one.  Afterwards both copies are valid.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Read-write access to the host value array: sync values down from the GPU first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* End of read-write access: host copy is now the valid one */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access: sync values down from the GPU, but do not change the offload mask on restore */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access: no device-to-host copy is needed since the caller overwrites the values */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/*
   Expose the device CSR arrays (row offsets, column indices, values) of the GPU copy.
   Only for unfactored matrices; indices are 32-bit (cuSPARSE storage), so 64-bit PetscInt
   builds are not supported.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/*
   Mirror the host CSR matrix on the GPU.  If the nonzero pattern is unchanged only the values
   are copied; otherwise the whole device structure (CSR or, pre CUDA-11, ELL/HYB) is rebuilt.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if
(A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* values changed, so any cached transpose values are stale (structure is kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* nonzero pattern changed: throw away all device structures and rebuild from scratch */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed row storage: only rows with nonzeros are represented; ridx maps back to full rows */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values yet: copy structure only, keep offloadmask as-is */
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants for the device pointer mode set below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR copy, convert it to HYB, then discard the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate, second tuple element += first */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: assign first tuple element to second */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: assign second tuple element to first (reverse direction) */
struct
VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product workspace attached to C->product->data for sparse matrix-matrix products */
struct MatMatCusparse {
  PetscBool   cisdense;   /* C was MATSEQDENSE on entry; convert back after the product */
  PetscScalar *Bt;        /* explicit transpose of B (pre CUDA-11 csrmm path, which cannot transpose B) */
  Mat         X;          /* intermediate dense result for PtAP/RARt */
  PetscBool   reusesym;   /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix   *Bcsr;

 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda;     /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;
  void *dBuffer5;
 #endif
  size_t               mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
 #endif
};

/* Destructor for MatMatCusparse: frees device buffers, cusparse descriptors, and the intermediate X */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
 #endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
   Numeric phase of sparse(A) x dense(B) products (AB, AtB, ABt, PtAP, RARt) on the GPU.
   Uses cusparseSpMM on CUDA >= 11, csrmm (with an explicit cublas transpose of B when needed)
   otherwise.  PtAP/RARt compute the intermediate A*P (resp. A*R^T) into mmdata->X and finish
   with a dense-dense product.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the stored explicit transpose with a non-transpose op instead */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA =
CUSPARSE_OPERATION_NON_TRANSPOSE; 2097 } 2098 m = A->cmap->n; 2099 n = B->cmap->n; 2100 break; 2101 case MATPRODUCT_ABt: 2102 case MATPRODUCT_RARt: 2103 mat = cusp->mat; 2104 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2105 m = A->rmap->n; 2106 n = B->rmap->n; 2107 break; 2108 default: 2109 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2110 } 2111 PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2112 csrmat = (CsrMatrix*)mat->mat; 2113 /* if the user passed a CPU matrix, copy the data to the GPU */ 2114 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 2115 if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 2116 PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2117 2118 PetscCall(MatDenseGetLDA(B,&blda)); 2119 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2120 PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 2121 PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2122 } else { 2123 PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 2124 PetscCall(MatDenseGetLDA(C,&clda)); 2125 } 2126 2127 PetscCall(PetscLogGpuTimeBegin()); 2128 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2129 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2130 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2131 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2132 size_t mmBufferSize; 2133 if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2134 if (!mmdata->matBDescr) { 2135 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2136 mmdata->Blda = blda; 2137 } 2138 2139 if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2140 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2141 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2142 mmdata->Clda = clda; 2143 } 2144 2145 if (!mat->matDescr) { 2146 stat = cusparseCreateCsr(&mat->matDescr, 2147 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2148 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2149 csrmat->values->data().get(), 2150 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2151 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2152 } 2153 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2154 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2155 mmdata->matCDescr,cusparse_scalartype, 2156 cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2157 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2158 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2159 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2160 mmdata->mmBufferSize = mmBufferSize; 2161 } 2162 mmdata->initialized = PETSC_TRUE; 2163 } else { 2164 /* to be safe, always update pointers of the mats 
*/ 2165 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 2166 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 2167 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2168 } 2169 2170 /* do cusparseSpMM, which supports transpose on B */ 2171 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2172 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2173 mmdata->matCDescr,cusparse_scalartype, 2174 cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2175 #else 2176 PetscInt k; 2177 /* cusparseXcsrmm does not support transpose on B */ 2178 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2179 cublasHandle_t cublasv2handle; 2180 cublasStatus_t cerr; 2181 2182 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2183 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2184 B->cmap->n,B->rmap->n, 2185 &PETSC_CUSPARSE_ONE ,barray,blda, 2186 &PETSC_CUSPARSE_ZERO,barray,blda, 2187 mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2188 blda = B->cmap->n; 2189 k = B->cmap->n; 2190 } else { 2191 k = B->rmap->n; 2192 } 2193 2194 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2195 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2196 csrmat->num_entries,mat->alpha_one,mat->descr, 2197 csrmat->values->data().get(), 2198 csrmat->row_offsets->data().get(), 2199 csrmat->column_indices->data().get(), 2200 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* for RARt/PtAP the sparse-times-dense result went into mmdata->X; finish with a dense-dense product */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* convert back in place when the caller's matrices were plain (CPU) SEQDENSE */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase of C = op(A)*op(B) with A of type SEQAIJCUSPARSE (CSR storage only)
   and B dense on the GPU: sets the sizes and type of C and allocates the MatMatCusparse
   product data (a B^T scratch buffer on CUDA < 11, and an intermediate dense matrix X
   for the RARt/PtAP products). The matching numeric phase is
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA, installed at the bottom. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the result for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP: /* square result of size B->cmap->n */
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt: /* square result of size B->rmap->n */
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

static PetscErrorCode
/* Numeric phase of C = op(A)*op(B) with A, B, C all SEQAIJCUSPARSE (CSR storage only):
   reuses the descriptors, buffers and spgemm descriptor created by
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE (stored in the MatMatCusparse
   product data) and recomputes the values of C on the GPU. */
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize; /* values already on the GPU; only the bookkeeping below is needed */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* when symmetry was exploited at symbolic time, map At*B / A*Bt back to A*B
     so the (possibly explicit-transpose) operands picked below match */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB: /* uses the explicit transpose formed in the symbolic phase */
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt: /* uses the explicit transpose formed in the symbolic phase */
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* structure was fixed at symbolic time; only recompute the values of C */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA()); /* synchronize so the GPU timer covers the asynchronous spgemm */
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of C = op(A)*op(B) with A, B SEQAIJCUSPARSE (CSR storage only):
   determines the sparsity pattern of C on the GPU and allocates all descriptors
   and work buffers reused by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: At*B (resp. A*Bt) equals A*B when A (resp. B) is symmetric;
     record the fact so the numeric phase picks the same operands */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* select result sizes, operand mult structs and compressed-row handling per product type */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m    = A->cmap->n;
    n    = B->cmap->n;
    k    = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m    = A->rmap->n;
    n    = B->rmap->n;
    k    = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows  = Ccusp->nrows;
  Ccsr->num_cols  = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident alpha/beta scalars, used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices; /* shared with the compressed representation; only row offsets are rebuilt */
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C's descriptor starts empty (nnz 0, NULL pointers); cusparseCsrSetPointers fills it once the nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void* dBuffer1 = NULL;
    void* dBuffer2 = NULL;
    void* dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3)); /* dBuffer3 must stay alive through the copy step above */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the GPU CSR structure into the host Mat_SeqAIJ bookkeeping fields */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i; /* compressed row offsets live in the compressedrow struct */
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row lengths, nonzero-row count and maximum row length from the expanded offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        ierr
= PetscOptionsEnd();PetscCall(ierr); 2855 } 2856 break; 2857 case MATPRODUCT_AtB: 2858 if (product->api_user) { 2859 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");PetscCall(ierr); 2860 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 2861 ierr = PetscOptionsEnd();PetscCall(ierr); 2862 } else { 2863 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");PetscCall(ierr); 2864 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 2865 ierr = PetscOptionsEnd();PetscCall(ierr); 2866 } 2867 break; 2868 case MATPRODUCT_PtAP: 2869 if (product->api_user) { 2870 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");PetscCall(ierr); 2871 PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 2872 ierr = PetscOptionsEnd();PetscCall(ierr); 2873 } else { 2874 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");PetscCall(ierr); 2875 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 2876 ierr = PetscOptionsEnd();PetscCall(ierr); 2877 } 2878 break; 2879 case MATPRODUCT_RARt: 2880 if (product->api_user) { 2881 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");PetscCall(ierr); 2882 PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 2883 ierr = PetscOptionsEnd();PetscCall(ierr); 2884 } else { 2885 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");PetscCall(ierr); 2886 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU 
code","MatRARt",usecpu,&usecpu,NULL)); 2887 ierr = PetscOptionsEnd();PetscCall(ierr); 2888 } 2889 break; 2890 case MATPRODUCT_ABC: 2891 if (product->api_user) { 2892 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");PetscCall(ierr); 2893 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 2894 ierr = PetscOptionsEnd();PetscCall(ierr); 2895 } else { 2896 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");PetscCall(ierr); 2897 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 2898 ierr = PetscOptionsEnd();PetscCall(ierr); 2899 } 2900 break; 2901 default: 2902 break; 2903 } 2904 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2905 } 2906 /* dispatch */ 2907 if (isdense) { 2908 switch (product->type) { 2909 case MATPRODUCT_AB: 2910 case MATPRODUCT_AtB: 2911 case MATPRODUCT_ABt: 2912 case MATPRODUCT_PtAP: 2913 case MATPRODUCT_RARt: 2914 if (product->A->boundtocpu) { 2915 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 2916 } else { 2917 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2918 } 2919 break; 2920 case MATPRODUCT_ABC: 2921 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2922 break; 2923 default: 2924 break; 2925 } 2926 } else if (Biscusp && Ciscusp) { 2927 switch (product->type) { 2928 case MATPRODUCT_AB: 2929 case MATPRODUCT_AtB: 2930 case MATPRODUCT_ABt: 2931 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2932 break; 2933 case MATPRODUCT_PtAP: 2934 case MATPRODUCT_RARt: 2935 case MATPRODUCT_ABC: 2936 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2937 break; 2938 default: 2939 break; 2940 } 2941 } else { /* fallback for AIJ */ 2942 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 2943 } 2944 PetscFunctionReturn(0); 2945 } 2946 2947 
/* y = A x */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y = A^H x */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n); one thread per entry, launched with ceil(n/256) blocks of 256.
   NOTE: the index is computed in PetscInt: with 64-bit PetscInt and n > 2^31 the former
   'int i = blockIdx.x*blockDim.x + threadIdx.x' overflowed before the 'i < n' guard. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: result is just y (or zero) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and build on demand) an explicitly stored transpose */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                                  matstruct->matDescr,
                                                  matstruct->cuSpMV[opA].vecXDescr, beta,
                                                  matstruct->cuSpMV[opA].vecYDescr,
                                                  cusparse_scalartype,
                                                  cusparsestruct->spmvAlg,
                                                  &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                     matstruct->alpha_one,
                                     matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr,
                                     beta,
                                     matstruct->cuSpMV[opA].vecYDescr,
                                     cusparse_scalartype,
                                     cusparsestruct->spmvAlg,
                                     matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                          mat->num_rows, mat->num_cols,
                                          mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                          mat->values->data().get(), mat->row_offsets->data().get(),
                                          mat->column_indices->data().get(), xptr, beta,
                                          dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                            matstruct->alpha_one, matstruct->descr, hybMat,
                                            xptr, beta,
                                            dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* z = A^T x + y */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  /* a changed nonzero pattern invalidates any cached device-side matrix */
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.
A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* release the GPU-side storage: plain matrices hold a Mat_SeqAIJCUSPARSE,
     factored matrices hold triangular-factor data instead */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  /* clear all composed methods installed by MatConvert/MatBindToCPU */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* duplicate on the CPU, then convert the copy in place back to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU: uses cublas axpy when the patterns match, cusparse spgeam
   for SUBSET_NONZERO_PATTERN, otherwise falls back to the CPU implementation */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed CPU/GPU operands: invalidate the cached transpose and do it on the CPU */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t bufferSize;
    void   *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* alpha/beta are host scalars here, so switch pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                     &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                     &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                     cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the values arrays are conformal, so a flat axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, done as a flat cublas scal over the nonzero values */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz,&bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool  both = PETSC_FALSE;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    /* zero the device copies (matrix and cached transpose) when they exist */
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* if the GPU values were zeroed too, host and device agree */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switches the function tables between the CPU (SeqAIJ) and GPU (CUSPARSE) implementations.
   flg == PETSC_TRUE binds to the CPU (after syncing values back from the GPU). */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-side optimization; only enable them when bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3568 3569 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3570 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3571 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3572 3573 Options Database Keys: 3574 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3575 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3576 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3577 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3578 3579 Level: beginner 3580 3581 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3582 M*/ 3583 3584 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3585 3586 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3587 { 3588 PetscFunctionBegin; 3589 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 3590 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 3591 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 3592 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 3593 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 3594 3595 PetscFunctionReturn(0); 3596 } 3597 3598 static PetscErrorCode 
MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3599 { 3600 Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 3601 3602 PetscFunctionBegin; 3603 if (!cusp) PetscFunctionReturn(0); 3604 delete cusp->cooPerm; 3605 delete cusp->cooPerm_a; 3606 cusp->cooPerm = NULL; 3607 cusp->cooPerm_a = NULL; 3608 if (cusp->use_extended_coo) { 3609 PetscCallCUDA(cudaFree(cusp->jmap_d)); 3610 PetscCallCUDA(cudaFree(cusp->perm_d)); 3611 } 3612 cusp->use_extended_coo = PETSC_FALSE; 3613 PetscFunctionReturn(0); 3614 } 3615 3616 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3617 { 3618 PetscFunctionBegin; 3619 if (*cusparsestruct) { 3620 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 3621 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 3622 delete (*cusparsestruct)->workVector; 3623 delete (*cusparsestruct)->rowoffsets_gpu; 3624 delete (*cusparsestruct)->cooPerm; 3625 delete (*cusparsestruct)->cooPerm_a; 3626 delete (*cusparsestruct)->csr2csc_i; 3627 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 3628 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 3629 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 3630 PetscCall(PetscFree(*cusparsestruct)); 3631 } 3632 PetscFunctionReturn(0); 3633 } 3634 3635 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3636 { 3637 PetscFunctionBegin; 3638 if (*mat) { 3639 delete (*mat)->values; 3640 delete (*mat)->column_indices; 3641 delete (*mat)->row_offsets; 3642 delete *mat; 3643 *mat = 0; 3644 } 3645 PetscFunctionReturn(0); 3646 } 3647 3648 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3649 { 3650 PetscFunctionBegin; 3651 if (*trifactor) { 3652 if ((*trifactor)->descr) 
PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3653 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo)); 3654 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 3655 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 3656 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3657 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3658 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3659 #endif 3660 PetscCall(PetscFree(*trifactor)); 3661 } 3662 PetscFunctionReturn(0); 3663 } 3664 3665 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3666 { 3667 CsrMatrix *mat; 3668 3669 PetscFunctionBegin; 3670 if (*matstruct) { 3671 if ((*matstruct)->mat) { 3672 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3673 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3674 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3675 #else 3676 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3677 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3678 #endif 3679 } else { 3680 mat = (CsrMatrix*)(*matstruct)->mat; 3681 CsrMatrix_Destroy(&mat); 3682 } 3683 } 3684 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 3685 delete (*matstruct)->cprowIndices; 3686 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 3687 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 3688 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3689 3690 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3691 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3692 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3693 for (int i=0; i<3; i++) { 3694 if (mdata->cuSpMV[i].initialized) { 
3695 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 3696 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 3697 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3698 } 3699 } 3700 #endif 3701 delete *matstruct; 3702 *matstruct = NULL; 3703 } 3704 PetscFunctionReturn(0); 3705 } 3706 3707 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3708 { 3709 PetscFunctionBegin; 3710 if (*trifactors) { 3711 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr)); 3712 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr)); 3713 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose)); 3714 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose)); 3715 delete (*trifactors)->rpermIndices; 3716 delete (*trifactors)->cpermIndices; 3717 delete (*trifactors)->workVector; 3718 (*trifactors)->rpermIndices = NULL; 3719 (*trifactors)->cpermIndices = NULL; 3720 (*trifactors)->workVector = NULL; 3721 if ((*trifactors)->a_band_d) PetscCallCUDA(cudaFree((*trifactors)->a_band_d)); 3722 if ((*trifactors)->i_band_d) PetscCallCUDA(cudaFree((*trifactors)->i_band_d)); 3723 (*trifactors)->init_dev_prop = PETSC_FALSE; 3724 } 3725 PetscFunctionReturn(0); 3726 } 3727 3728 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3729 { 3730 cusparseHandle_t handle; 3731 3732 PetscFunctionBegin; 3733 if (*trifactors) { 3734 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 3735 if (handle = (*trifactors)->handle) { 3736 PetscCallCUSPARSE(cusparseDestroy(handle)); 3737 } 3738 PetscCall(PetscFree(*trifactors)); 3739 } 3740 PetscFunctionReturn(0); 3741 } 3742 3743 struct IJCompare 3744 { 3745 __host__ __device__ 3746 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3747 { 3748 if (t1.get<0>() < 
t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of two (row, col) index pairs; used to drop duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* Binary op for adjacent_difference: 0 when the two values are equal, 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

/* Logical OR on PetscInt; merges the row-changed and column-changed flags */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(): scatter/reduce the user's
   COO values v[] into the device CSR value array using the cooPerm permutation (and
   cooPerm_a reduction keys when the COO pattern contained repeated entries) */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation data: just run the regular assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL v with INSERT_VALUES zeroes the matrix; with ADD_VALUES it is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  /* make v[] accessible from the device, copying it over only when it lives on the host */
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose as stale; when destroy is true also free it together with
   the csr2csc index array */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A on the device from the COO pattern (coo_i[], coo_j[]):
   sort the (i,j) pairs, record the sorting permutation in cooPerm, deduplicate repeated
   entries (recording reduction keys in cooPerm_a), and mirror the resulting row/column
   metadata back to the host SeqAIJ structure. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  /* reuse the existing permutation arrays only when the number of COO entries is unchanged */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host-side CSR metadata (a->i, a->j, ilen/imax, nonzero counters) from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation dispatcher: use the fast 'Basic' device path when the indices are on
   the device or contain no negative entries; otherwise fall back to the host SeqAIJ
   implementation and mirror its jmap/perm arrays on the device (extended COO support) */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices (ignored entries) are only handled by the host fallback */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ*>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}

__global__ void
MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4032 { 4033 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4034 const PetscCount grid_size = gridDim.x * blockDim.x; 4035 for (; i<nnz; i+= grid_size) { 4036 PetscScalar sum = 0.0; 4037 for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4038 a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum; 4039 } 4040 } 4041 4042 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4043 { 4044 Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4045 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4046 PetscCount Annz = seq->nz; 4047 PetscMemType memtype; 4048 const PetscScalar *v1 = v; 4049 PetscScalar *Aa; 4050 4051 PetscFunctionBegin; 4052 if (dev->use_extended_coo) { 4053 PetscCall(PetscGetMemType(v,&memtype)); 4054 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4055 PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 4056 PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4057 } 4058 4059 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 4060 else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4061 4062 if (Annz) { 4063 MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 4064 PetscCallCUDA(cudaPeekAtLastError()); 4065 } 4066 4067 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 4068 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4069 4070 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); 4071 } else { 4072 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4073 } 4074 PetscFunctionReturn(0); 4075 } 4076 4077 /*@C 4078 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 
4079 4080 Not collective 4081 4082 Input Parameters: 4083 + A - the matrix 4084 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4085 4086 Output Parameters: 4087 + ia - the CSR row pointers 4088 - ja - the CSR column indices 4089 4090 Level: developer 4091 4092 Notes: 4093 When compressed is true, the CSR structure does not contain empty rows 4094 4095 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 4096 @*/ 4097 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4098 { 4099 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4100 CsrMatrix *csr; 4101 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4102 4103 PetscFunctionBegin; 4104 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4105 if (!i || !j) PetscFunctionReturn(0); 4106 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4107 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4108 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4109 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4110 csr = (CsrMatrix*)cusp->mat->mat; 4111 if (i) { 4112 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4113 if (!cusp->rowoffsets_gpu) { 4114 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4115 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4116 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4117 } 4118 *i = cusp->rowoffsets_gpu->data().get(); 4119 } else *i = csr->row_offsets->data().get(); 4120 } 4121 if (j) *j = csr->column_indices->data().get(); 4122 PetscFunctionReturn(0); 4123 } 4124 4125 /*@C 4126 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4127 4128 Not collective 4129 4130 Input Parameters: 4131 + A - the matrix 4132 - compressed - PETSC_TRUE or 
PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4133 4134 Output Parameters: 4135 + ia - the CSR row pointers 4136 - ja - the CSR column indices 4137 4138 Level: developer 4139 4140 .seealso: MatSeqAIJCUSPARSEGetIJ() 4141 @*/ 4142 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4143 { 4144 PetscFunctionBegin; 4145 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4146 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4147 if (i) *i = NULL; 4148 if (j) *j = NULL; 4149 PetscFunctionReturn(0); 4150 } 4151 4152 /*@C 4153 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4154 4155 Not Collective 4156 4157 Input Parameter: 4158 . A - a MATSEQAIJCUSPARSE matrix 4159 4160 Output Parameter: 4161 . a - pointer to the device data 4162 4163 Level: developer 4164 4165 Notes: may trigger host-device copies if up-to-date matrix data is on host 4166 4167 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 4168 @*/ 4169 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4170 { 4171 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4172 CsrMatrix *csr; 4173 4174 PetscFunctionBegin; 4175 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4176 PetscValidPointer(a,2); 4177 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4178 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4179 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4180 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4181 csr = (CsrMatrix*)cusp->mat->mat; 4182 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4183 *a = csr->values->data().get(); 4184 PetscFunctionReturn(0); 4185 } 4186 4187 /*@C 4188 MatSeqAIJCUSPARSERestoreArrayRead - restore 
the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4189 4190 Not Collective 4191 4192 Input Parameter: 4193 . A - a MATSEQAIJCUSPARSE matrix 4194 4195 Output Parameter: 4196 . a - pointer to the device data 4197 4198 Level: developer 4199 4200 .seealso: MatSeqAIJCUSPARSEGetArrayRead() 4201 @*/ 4202 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4203 { 4204 PetscFunctionBegin; 4205 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4206 PetscValidPointer(a,2); 4207 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4208 *a = NULL; 4209 PetscFunctionReturn(0); 4210 } 4211 4212 /*@C 4213 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4214 4215 Not Collective 4216 4217 Input Parameter: 4218 . A - a MATSEQAIJCUSPARSE matrix 4219 4220 Output Parameter: 4221 . a - pointer to the device data 4222 4223 Level: developer 4224 4225 Notes: may trigger host-device copies if up-to-date matrix data is on host 4226 4227 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 4228 @*/ 4229 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4230 { 4231 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4232 CsrMatrix *csr; 4233 4234 PetscFunctionBegin; 4235 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4236 PetscValidPointer(a,2); 4237 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4238 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4239 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4240 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4241 csr = (CsrMatrix*)cusp->mat->mat; 4242 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4243 *a = csr->values->data().get(); 4244 A->offloadmask = PETSC_OFFLOAD_GPU; 4245 
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* device values (may) have changed: cached transpose is stale */
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data; set to NULL on return so it cannot be reused

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been modified through the pointer: cached diagonal information is stale */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* bump the object state so dependent objects notice the (possible) modification */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw device value array */
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller is going to overwrite the device values: mark the GPU copy as the valid one
     and drop any cached transpose (no host-device copy is triggered here) */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data; set to NULL on return so it cannot be reused

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were overwritten on the device: invalidate cached diagonal info and bump the state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}

/* Comparator for (row, col, value, tag) COO tuples: lexicographic order on (row, col).
   Used below as the ordering for thrust::merge when combining two matrices. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Functor adding a fixed offset to an integer index; used below to shift B's column
   indices (by A's column count) and B's transposed row offsets (by A's nnz). */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
   [A';B']' operation in matlab notation, i.e. C = [A, B]: C keeps the common row count and has
   A->cmap->n + B->cmap->n columns. With MAT_INITIAL_MATRIX the CSR structure of C is assembled on
   the device by merging the COO forms of A and B; the merge permutation is cached in Ccusp->cooPerm
   so that MAT_REUSE_MATRIX only needs to scatter the current values of A and B into C.
   MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* NOTE(review): "number or rows" in this message is a typo for "number of rows" */
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  /* only the CSR storage format is handled */
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n; /* C = [A, B]: columns are concatenated */
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    /* C is stored with full (non-compressed) rows */
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* scalar constants (1, 0) used by later cusparse calls are kept in device memory */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz); /* cached for the MAT_REUSE_MATRIX path */
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets to per-entry COO row indices so A and B can be merged as triples */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* each merged entry carries a tag: 1 = came from A, 0 = came from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's column indices are shifted by A's column count (C = [A, B]) */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); /* shifted in place; undone after the merge */
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); /* receives the origin tags in merged order */
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two (row, col, value, tag) streams ordered by (row, col) */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* restore B's original column indices */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* cooPerm[0..Annz) = positions of A's entries in C; cooPerm[Annz..nz) = positions of B's */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        /* C^T = [A^T; B^T] (vertical stacking): n rows, m columns */
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* back up one: the shared boundary offset is rewritten by B^T's first offset */
        }
        if (BT) {
          /* B^T's row offsets are shifted by nnz(A) since A^T's entries come first */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* build the host-side CSR mirror (c->i, c->j) and row statistics required by the SeqAIJ base class */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i]; /* row length from the host row offsets */
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a)); /* host value array allocated but left on the GPU (offloadmask below) */
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure of C is unchanged, only scatter the new values via cooPerm */
    /* NOTE(review): "number or rows" in this message is a typo for "number of rows" */
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* consistency checks: the cached structure must still match A and B */
      PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries); /* cooPerm[0..Annz) are A's slots, the rest B's */
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through the cached permutation (VecCUDAEquals assigns across the zipped pair) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* likewise for B's values */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* C^T values are A^T's followed by B^T's (see the MAT_INITIAL_MATRIX layout) */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* values exist only on the device at this point */
  PetscFunctionReturn(0);
}

/* copy (a sub-selection of) the matrix values to v: v[k] = aa[idx[k]] for k = 0,...,n-1,
   where aa is the raw device value array of A; v may point to host or device memory.
   When idx is NULL the first n values are copied verbatim. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does v live in device memory? determines the copy direction below */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n); /* upload the index set */
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v); /* gather straight into v */
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer, copied back to the host afterwards */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
thrust::for_each(zibit,zieit,VecCUDAEquals()); 4689 if (w) { 4690 PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4691 } 4692 delete w; 4693 } else { 4694 PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4695 } 4696 if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4697 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4698 PetscFunctionReturn(0); 4699 } 4700