/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Name table for MatCUSPARSEStorageFormat; trailing entries are the enum type name, option prefix, and terminating 0 expected by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the SeqAIJCUSPARSE implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side storage; the two MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Record the given CUDA stream in the matrix's GPU data and attach it to the matrix's cuSPARSE handle,
   so subsequent cuSPARSE calls on this matrix are issued on that stream.
   Errors with PETSC_ERR_COR if the GPU data (A->spptr) has not been allocated yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  CHKERRCUSPARSE(cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream));
  PetscFunctionReturn(0);
}

/* Replace the matrix's cuSPARSE library handle with a caller-supplied one.
   Any previously held handle is destroyed first; the pointer mode is then set to
   CUSPARSE_POINTER_MODE_DEVICE, i.e. scalar arguments of cuSPARSE calls are expected to live on the device. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      CHKERRCUSPARSE(cusparseDestroy(cusparsestruct->handle));
    }
    cusparsestruct->handle = handle;
  }
  CHKERRCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
  PetscFunctionReturn(0);
}

/* Forget (but do NOT destroy) the matrix's cuSPARSE handle; a no-op for non-SEQAIJCUSPARSE
   matrices or when the GPU data is absent. Presumably used when the handle is owned elsewhere
   (e.g. by an MPIAIJCUSPARSE parent) — TODO confirm against callers. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool flg;

  PetscFunctionBegin;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSOLVERCUSPARSE: creates an (empty) factor matrix B of type
   MATSEQAIJCUSPARSE for A, and installs the symbolic-factorization function pointers appropriate
   for the requested factor type (LU/ILU/ILUDT or Cholesky/ICC).  When A is bound to the CPU the
   plain SeqAIJ symbolic routines are installed instead, so the whole factorization stays on the host. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;  /* factors are square, sized by the row map of A */

  PetscFunctionBegin;
  CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B));
  CHKERRQ(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  CHKERRQ(MatSetType(*B,MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) CHKERRQ(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    CHKERRQ(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
(*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for the incomplete variants */
    CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the symbolic factorization will allocate the actual storage */
  CHKERRQ(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  CHKERRQ(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested GPU storage
   format in the matrix's GPU data.  For SEQAIJCUSPARSE a single format field covers both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL, hence the identical case bodies. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently does nothing for matrix types that did not compose the method */
  CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)));
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): records whether MatSolve
   should run on the CPU instead of the GPU (consumed in MatLUFactorNumeric_SeqAIJCUSPARSE) */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Output Parameter:

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
This method to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific setter if it was composed on A */
  CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)));
  PetscFunctionReturn(0);
}

/* MatSetOption implementation: intercepts MAT_FORM_EXPLICIT_TRANSPOSE so the cached GPU
   transpose can be dropped when the option is turned off; everything else is delegated to
   the SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    CHKERRQ(MatSetOption_SeqAIJ(A,op,flg));
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: the factorization itself is done on the CPU by
   MatLUFactorNumeric_SeqAIJ (A's values are first synced back from the GPU); afterwards the
   solve function pointers are chosen based on whether the row/column orderings are identities,
   and — unless use_cpu_solve is set — the triangular factors are analyzed and copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS isrow = b->row,iscol = b->col;
  PetscBool row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  CHKERRQ(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;   /* fresh factors live on the host only */
  /* determine which version of MatSolve needs to be used.
*/
  CHKERRQ(ISIdentity(isrow,&row_identity));
  CHKERRQ(ISIdentity(iscol,&col_identity));
  if (row_identity && col_identity) {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve          = NULL;  /* no GPU MatMatSolve available */
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    CHKERRQ(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}

/* Options processing for SEQAIJCUSPARSE: storage format, CPU-solve toggle, and (CUDA >= 11)
   the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.  The PetscCheckFalse() guards verify that
   the integer values PetscOptionsEnum() assigns (positions in the name arrays above) still match
   the numeric values of the corresponding cuSPARSE enums. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool flg;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  CHKERRQ(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options"));
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    CHKERRQ(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) CHKERRQ(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  CHKERRQ(PetscOptionsTail());
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any existing GPU triangular-factor data, run the CPU symbolic phase,
   then route the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  CHKERRQ(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  CHKERRQ(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, CUSPARSE numeric phase */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  CHKERRQ(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, CUSPARSE numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  CHKERRQ(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or refresh) the GPU copy of the unit lower-triangular factor L from the CPU
   (I)LU factor stored in A (factored-matrix AIJ layout).  On first call the CSR structure is
   assembled in pinned host buffers and uploaded, and the cuSPARSE solve analysis is run;
   on later calls only the numerical values are refilled and re-uploaded. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt *ai = a->i,*aj = a->j,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiLo, *AjLo;
  PetscInt i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host memory so the thrust assign()s below can transfer efficiently */
        CHKERRCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        CHKERRCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        CHKERRCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix; the implicit unit diagonal entry is appended to each row */
        AiLo[0] = (PetscInt) 0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt) 0;
        AALo[0] = (MatScalar) 1.0;
        v  = aa;
        vi = aj;
        offset = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz+1;

          CHKERRQ(PetscArraycpy(&(AjLo[offset]), vi, nz));
          CHKERRQ(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        CHKERRQ(PetscNew(&loTriFactor));
loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload the host CSR arrays to device (thrust) vectors */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API: query and allocate the work buffer before the analysis call */
        CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        CHKERRCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                         loTriFactor->solveInfo));
#endif
        CHKERRCUDA(WaitForCUDA());
        CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer; AA_h (pinned values buffer) is kept for fast refills, index buffers are freed */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        CHKERRCUDA(cudaFreeHost(AiLo));
        CHKERRCUDA(cudaFreeHost(AjLo));
        CHKERRQ(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          CHKERRCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v  = aa;
        vi = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          CHKERRQ(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        CHKERRQ(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh) the GPU copy of the upper-triangular factor U from the CPU (I)LU factor.
   The factored AIJ layout stores U rows backwards via a->diag, so rows are walked from the last
   to the first; the stored inverse diagonal (1/v[nz]) is placed as the first entry of each row.
   First call builds structure + runs cuSPARSE solve analysis; later calls refresh values only. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscInt i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        CHKERRCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        CHKERRCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        CHKERRCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];   /* a->a stores the inverted diagonal at v[nz] */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          CHKERRQ(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          CHKERRQ(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        CHKERRQ(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        CHKERRCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                         upTriFactor->solveInfo));
#endif
        CHKERRCUDA(WaitForCUDA());
        CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        CHKERRCUDA(cudaFreeHost(AiUp));
        CHKERRCUDA(cudaFreeHost(AjUp));
        CHKERRQ(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else {
        /* update values only (structure unchanged) */
        if (!upTriFactor->AA_h) {
          CHKERRCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          CHKERRQ(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        CHKERRQ(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy both (I)LU triangular factors to the GPU, allocate the work vector used by the
   two-stage triangular solve, and cache the row/column permutations on the device when
   the orderings are not identities. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS isrow = a->row,iscol = a->icol;
  PetscBool row_identity,col_identity;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  CHKERRQ(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  CHKERRQ(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector holding the intermediate result between the L-solve and the U-solve */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: cache the row permutation on the device (only built once) */
  CHKERRQ(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    CHKERRQ(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    CHKERRQ(ISRestoreIndices(isrow,&r));
    CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices: likewise for the column permutation */
  CHKERRQ(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    CHKERRQ(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    CHKERRQ(ISRestoreIndices(iscol,&c));
    CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Build the GPU triangular factors for an ICC factorization (continues beyond this excerpt).
   NOTE(review): A->data is cast both to Mat_SeqAIJ* (for a->nz) and to Mat_SeqSBAIJ* (for the
   i/j/a arrays) — presumably the ICC factor is stored in SBAIJ layout inside an AIJ shell;
   confirm against MatICCFactorSymbolic_SeqAIJ before relying on either view. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscScalar *AALo;
  PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt *ai = b->i,*aj = b->j,*vj;
  const MatScalar *aa = b->a,*v;
  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper/lower factor values */
      CHKERRCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        CHKERRCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        CHKERRCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (diagonal is the last entry of each SBAIJ row) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            CHKERRQ(PetscArraycpy(&(AjUp[offset]), vj, nz));
            CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonals: negated for the upper solve; additionally scaled by the
               inverted diagonal for the (transpose) lower solve */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        CHKERRQ(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* CUDA >= 9 csrsv2 API requires an explicitly sized work buffer for the analysis/solve */
        CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        CHKERRCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy,
                                         upTriFactor->solveBuffer));
#else
                                         upTriFactor->solveInfo));
#endif
      CHKERRCUDA(WaitForCUDA());
      CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

      /* assign the pointer */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        CHKERRQ(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the lower-triangular solve reuses the upper factor's CSR structure and is executed as
           a transpose solve: hence FILL_MODE_UPPER here together with OPERATION_TRANSPOSE below */
        CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        CHKERRCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                         loTriFactor->solveInfo));
#endif
        CHKERRCUDA(WaitForCUDA());
        CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        CHKERRQ(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        CHKERRCUDA(cudaFreeHost(AiUp));
        CHKERRCUDA(cudaFreeHost(AjUp));
      } else {
        /* factors already allocated on the GPU: recompute only the numerical values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        /* push the refreshed values to the existing device-side factor matrices */
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        CHKERRQ(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      CHKERRCUDA(cudaFreeHost(AAUp));
      CHKERRCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* One-time GPU setup for ICC factors: builds the triangular matrices, the MatSolve work
   vector, and (for non-identity orderings) the row/column permutation arrays on the device. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  CHKERRQ(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles of the symmetric factor, counting the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  CHKERRQ(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    /* row permutation and its inverse serve as the row/column permutations of the symmetric factor */
    CHKERRQ(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    CHKERRQ(ISGetIndices(iip,&irip));
    CHKERRQ(ISGetIndices(ip,&rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    CHKERRQ(ISRestoreIndices(iip,&irip));
    CHKERRQ(ISDestroy(&iip));
    CHKERRQ(ISRestoreIndices(ip,&rip));
    CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky/ICC factorization: performs the factorization on the CPU, selects the
   GPU solve kernels appropriate for the ordering, then copies the factors to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b  = (Mat_SeqAIJ*)B->data;
  IS         ip = b->row;
  PetscBool  perm_identity;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  CHKERRQ(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  CHKERRQ(ISIdentity(ip,&perm_identity));
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  CHKERRQ(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Builds transposed (CSC) copies of both triangular factors, with their cusparse
   solve-analysis data, so MatSolveTranspose can run entirely on the GPU. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  CHKERRQ(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular
factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposing flips the fill mode; index base, matrix type and diagonal type carry over */
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  CHKERRCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 csr2cscEx2 needs an explicitly sized work buffer */
  CHKERRCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  CHKERRCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  CHKERRCUDA(WaitForCUDA());
  /* bug fix: this was PetscLogEventBegin(), leaving the event begun above unmatched */
  CHKERRQ(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                         loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                         loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                         loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                         &loTriFactorT->solveBufferSize));
  CHKERRCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                   loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                   loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                   loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   loTriFactorT->solveInfo,
                                   loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                   loTriFactorT->solveInfo));
#endif
  CHKERRCUDA(WaitForCUDA());
  CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  CHKERRQ(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  CHKERRCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  CHKERRCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                               upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                               upTriFactor->csrMat->values->data().get(),
                                               upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(),
                                               upTriFactorT->csrMat->values->data().get(),
                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  CHKERRCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                  upTriFactor->csrMat->values->data().get(),
                                  upTriFactor->csrMat->row_offsets->data().get(),
                                  upTriFactor->csrMat->column_indices->data().get(),
                                  upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  CHKERRCUDA(WaitForCUDA());
  /* bug fix: this was PetscLogEventBegin(), leaving the event begun above unmatched */
  CHKERRQ(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                         upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                         upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                         upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                         &upTriFactorT->solveBufferSize));
  CHKERRCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  /* TODO: the four analysis set-ups in this function follow an identical pattern and
     should be factored into a shared helper */
  CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                   upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                   upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                   upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   upTriFactorT->solveInfo,
                                   upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                   upTriFactorT->solveInfo));
#endif

  CHKERRCUDA(WaitForCUDA());
  CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor: truncate the real part of a PetscScalar to a PetscInt.  Used below with
   thrust::transform to turn scalar-encoded positions into the csr2csc_i permutation. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 1249 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1250 PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1251 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1252 PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1253 if (A->transupdated) PetscFunctionReturn(0); 1254 CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1255 CHKERRQ(PetscLogGpuTimeBegin()); 1256 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1257 CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1258 } 1259 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1260 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1261 CHKERRCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1262 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1263 CHKERRCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1264 CHKERRCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1265 1266 /* set alpha and beta */ 1267 CHKERRCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 1268 CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 1269 CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1270 CHKERRCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1271 CHKERRCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1272 CHKERRCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1273 1274 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1275 CsrMatrix *matrixT = new CsrMatrix; 1276 matstructT->mat = matrixT; 1277 matrixT->num_rows = A->cmap->n; 1278 matrixT->num_cols = A->rmap->n; 1279 matrixT->num_entries = a->nz; 1280 matrixT->row_offsets = new 
THRUSTINTARRAY32(matrixT->num_rows+1); 1281 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1282 matrixT->values = new THRUSTARRAY(a->nz); 1283 1284 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1285 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1286 1287 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1288 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1289 stat = cusparseCreateCsr(&matstructT->matDescr, 1290 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1291 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1292 matrixT->values->data().get(), 1293 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1294 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1295 #else 1296 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1297 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1298 1299 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1300 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1301 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1302 */ 1303 if (matrixT->num_entries) { 1304 stat = cusparseCreateCsr(&matstructT->matDescr, 1305 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1306 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1307 matrixT->values->data().get(), 1308 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1309 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1310 1311 } else { 1312 matstructT->matDescr = NULL; 1313 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1314 } 1315 #endif 1316 #endif 1317 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1318 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1319 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1320 #else 1321 CsrMatrix *temp = new CsrMatrix; 1322 CsrMatrix *tempT = new CsrMatrix; 1323 /* First convert HYB to CSR */ 1324 temp->num_rows = A->rmap->n; 1325 temp->num_cols = A->cmap->n; 1326 temp->num_entries = a->nz; 1327 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1328 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1329 temp->values = new THRUSTARRAY(a->nz); 1330 1331 stat = cusparse_hyb2csr(cusparsestruct->handle, 1332 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1333 temp->values->data().get(), 1334 temp->row_offsets->data().get(), 1335 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1336 1337 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1338 tempT->num_rows = A->rmap->n; 1339 tempT->num_cols = A->cmap->n; 1340 tempT->num_entries = a->nz; 1341 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1342 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1343 tempT->values = new THRUSTARRAY(a->nz); 1344 1345 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1346 temp->num_cols, temp->num_entries, 1347 temp->values->data().get(), 1348 temp->row_offsets->data().get(), 1349 temp->column_indices->data().get(), 1350 tempT->values->data().get(), 1351 tempT->column_indices->data().get(), 1352 tempT->row_offsets->data().get(), 1353 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1354 1355 /* Last, convert CSC to HYB */ 1356 cusparseHybMat_t hybMat; 1357 CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat)); 1358 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1359 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1360 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1361 matstructT->descr, tempT->values->data().get(), 1362 tempT->row_offsets->data().get(), 1363 tempT->column_indices->data().get(), 1364 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1365 1366 /* assign the pointer */ 1367 matstructT->mat = hybMat; 1368 A->transupdated = PETSC_TRUE; 1369 /* delete temporaries */ 1370 if (tempT) { 1371 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1372 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1373 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1374 delete (CsrMatrix*) tempT; 1375 } 1376 if (temp) { 1377 if (temp->values) delete (THRUSTARRAY*) temp->values; 1378 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1379 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1380 delete (CsrMatrix*) temp; 1381 } 1382 #endif 1383 } 1384 } 1385 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat 
struct may be already present, update data */ 1386 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1387 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1388 PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1389 PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1390 PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1391 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1392 PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1393 PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1394 PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1395 PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1396 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1397 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1398 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1399 CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1400 } 1401 if (!cusparsestruct->csr2csc_i) { 1402 THRUSTARRAY csr2csc_a(matrix->num_entries); 1403 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1404 1405 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1406 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1407 void *csr2cscBuffer; 1408 size_t csr2cscBufferSize; 1409 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1410 A->cmap->n, matrix->num_entries, 1411 matrix->values->data().get(), 1412 cusparsestruct->rowoffsets_gpu->data().get(), 1413 matrix->column_indices->data().get(), 1414 matrixT->values->data().get(), 1415 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1416 CUSPARSE_ACTION_NUMERIC,indexBase, 1417 
cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
        CHKERRCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
      #endif

        if (matrix->num_entries) {
          /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
             mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
             I checked every parameters and they were just fine. I have no clue why cusparse complains.

             Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
             should be filled with indexBase. So I just take a shortcut here.
          */
          /* Run a NUMERIC csr2csc on the sequence 0,1,...,nnz-1 (csr2csc_a): the transposed value array then
             holds, for each entry of A^T, the position of the matching entry in A's value array */
          stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n,matrix->num_entries,
                                  csr2csc_a.data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
      #else
                                  matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
      #endif
        } else {
          /* nnz = 0: fill the transposed row offsets with indexBase, as csr2cscEx2 would */
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }

        /* csr2csc_i[k] = index into A's value array of the k-th entry of A^T (cached permutation) */
        cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
        PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        CHKERRCUDA(cudaFree(csr2cscBuffer));
      #endif
      }
      /* Scatter A's values into A^T through the cached permutation, avoiding a full csr2csc on updates */
      PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                        thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                        matrixT->values->begin()));
    }
    CHKERRQ(PetscLogGpuTimeEnd());
    CHKERRQ(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
    /* the compressed row indices is not used for matTranspose */
    matstructT->cprowIndices = NULL;
    /* assign the pointer */
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
    A->transupdated = PETSC_TRUE;
    PetscFunctionReturn(0);
  }

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using transposed triangular factors: permute b by the row permutation,
   solve with U^T then L^T, then apply the column permutation into x */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A^T x = b for a factorization in natural ordering: no permutations, just the
   U^T then L^T triangular solves */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached triangular factors: apply the row permutation to b,
   forward-solve L, back-solve U, then apply the column permutation into x */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b for a factorization in natural ordering (no permutations):
   forward-solve L, then back-solve U */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Copy the CSR values from the GPU back to the host AIJ storage when the GPU copy is
   the up-to-date one; afterwards both copies are valid (PETSC_OFFLOAD_BOTH) */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    /* only the values move; the nonzero structure on the host is unchanged */
    CHKERRCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    CHKERRCUDA(WaitForCUDA());
    CHKERRQ(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Read/write access to the host value array: sync values from the GPU first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* End read/write access: host copy is now the valid one */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access: sync values from the GPU but leave the offload mask untouched */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access: no device-to-host sync needed since the values will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* End write access: host copy is now the valid one */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
PetscFunctionReturn(0);
}

/* Mirror the host AIJ matrix on the GPU: either refresh just the values (same nonzero
   pattern, CSR format) or rebuild the whole cuSPARSE structure from scratch */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      CHKERRCUDA(WaitForCUDA());
      CHKERRQ(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* transpose values are stale now; structure (PETSC_FALSE) is kept */
      CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage keeps only the nonempty rows; ridx maps them back to full rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* values not allocated on host: copy structure only */
        else nnz = a->nz;
        PetscCheckFalse(nnz && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        CHKERRCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        CHKERRCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, required by CUSPARSE_POINTER_MODE_DEVICE below */
        CHKERRCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        CHKERRCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR first, then convert it to HYB/ELL below */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat));
          /* ELL: fixed-width (MAX) partition; HYB: let cusparse choose (AUTO) */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR arrays are no longer needed once converted to HYB */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        CHKERRQ(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      CHKERRCUDA(WaitForCUDA());
      CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* if host values were missing (structure-only copy) the host copy is not valid yet */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Functor for zipped (x,y) tuples: y += x */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Functor for zipped (x,y) tuples: y = x */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Functor for zipped (x,y) tuples: x = y (assignment in the opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data attached to Mat_Product for sparse-dense/sparse-sparse products */
struct MatMatCusparse {
  PetscBool      cisdense;  /* if true, C is converted back to MATSEQDENSE after the numeric phase */
  PetscScalar    *Bt;       /* device buffer holding an explicit transpose of B (pre-CUDA-11 ABt/RARt path) */
  Mat            X;         /* intermediate dense product used for PtAP/RARt */
  PetscBool      reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda;     /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t mmBufferSize;
  void   *mmBuffer;
  void   *mmBuffer2;                  /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse product data: release all device buffers and cusparse descriptors.
   cudaFree(NULL) is a no-op, so unallocated members are safe to free unconditionally. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  CHKERRCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) CHKERRCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  CHKERRCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) CHKERRCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) CHKERRCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer)  CHKERRCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) CHKERRCUDA(cudaFree(mmdata->mmBuffer2));
#endif
CHKERRQ(MatDestroy(&mmdata->X));
  CHKERRQ(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A) op(B) with A SeqAIJCUSPARSE (sparse) and B dense:
   dispatches on the product type to select op(A)/op(B) and the output (C or the
   intermediate X for PtAP/RARt), then runs cusparseSpMM (CUDA >= 11) or csrmm */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cusparse transpose on the fly, or use an explicitly formed A^T */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) CHKERRQ(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  CHKERRQ(MatDenseCUDAGetArrayRead(B,&barray));

  CHKERRQ(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt first compute the sparse-dense product into the intermediate X */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    CHKERRQ(MatDenseCUDAGetArrayWrite(C,&carray));
    CHKERRQ(MatDenseGetLDA(C,&clda));
  }

  CHKERRQ(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds the cached one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      CHKERRCUDA(cudaFree(mmdata->mmBuffer));
      CHKERRCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
CHKERRCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 2174 CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 2175 CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2176 } 2177 2178 /* do cusparseSpMM, which supports transpose on B */ 2179 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2180 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2181 mmdata->matCDescr,cusparse_scalartype, 2182 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2183 #else 2184 PetscInt k; 2185 /* cusparseXcsrmm does not support transpose on B */ 2186 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2187 cublasHandle_t cublasv2handle; 2188 cublasStatus_t cerr; 2189 2190 CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle)); 2191 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2192 B->cmap->n,B->rmap->n, 2193 &PETSC_CUSPARSE_ONE ,barray,blda, 2194 &PETSC_CUSPARSE_ZERO,barray,blda, 2195 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2196 blda = B->cmap->n; 2197 k = B->cmap->n; 2198 } else { 2199 k = B->rmap->n; 2200 } 2201 2202 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2203 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2204 csrmat->num_entries,mat->alpha_one,mat->descr, 2205 csrmat->values->data().get(), 2206 csrmat->row_offsets->data().get(), 2207 csrmat->column_indices->data().get(), 2208 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  CHKERRQ(PetscLogGpuTimeEnd());
  /* 2*nnz(A) flops per dense column of op(B) */
  CHKERRQ(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  CHKERRQ(MatDenseCUDARestoreArrayRead(B,&barray));
  if (product->type == MATPRODUCT_RARt) {
    /* the sparse product went into the dense intermediate X; fold in B with a dense-dense multiply to get C */
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  if (mmdata->cisdense) {
    /* the caller originally supplied a MATSEQDENSE (CPU) C: convert back so the output type matches */
    CHKERRQ(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    /* B was converted to MATSEQDENSECUDA on entry above; restore its original type */
    CHKERRQ(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}

/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic/setup phase for products of a
  SEQAIJCUSPARSE matrix A with a dense matrix B (AB, AtB, ABt, PtAP, RARt).

  Sets the result sizes and type of C, allocates the MatMatCusparse product data (the B^T
  work buffer for CUDA < 11.0, and the dense intermediate X for PtAP/RARt), and installs
  the numeric-phase callback.  C keeps track of whether the user passed a CPU dense C so
  the numeric phase can convert the result back.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* determine the dimensions of the result from the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  CHKERRQ(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  CHKERRQ(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  CHKERRQ(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    CHKERRCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    CHKERRQ(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase for sparse-sparse
  products of two SEQAIJCUSPARSE matrices; see the symbolic routine below for the setup.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, just finish assembly bookkeeping */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));

  /* when a matrix is symmetric its (explicit) transpose can be avoided; the symbolic
     phase must have recorded that it exploited this (see checks below) */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the operand mult structs; any explicit transposes were formed during the symbolic phase */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  CHKERRQ(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* sparsity pattern (and buffers) were set up by SpGEMMreuse in the symbolic phase; only recompute values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  CHKERRQ(PetscLogGpuFlops(mmdata->flops));
  CHKERRCUDA(WaitForCUDA());
  CHKERRQ(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  CHKERRQ(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  CHKERRQ(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  CHKERRQ(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase for sparse-sparse
  products (AB, AtB, ABt) of two SEQAIJCUSPARSE matrices, computed on the GPU with
  cusparseSpGEMMreuse (CUDA >= 11.4), cusparseSpGEMM (CUDA >= 11.0) or Xcsrgemm (older).

  Builds the CSR structure of C (handling compressed-row operands), precomputes the flop
  count for logging, allocates the SpGEMM work buffers kept in the MatMatCusparse product
  data, and installs the numeric-phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  CHKERRQ(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
2474 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2475 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2476 PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2477 PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2478 2479 ptype = product->type; 2480 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2481 ptype = MATPRODUCT_AB; 2482 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2483 } 2484 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2485 ptype = MATPRODUCT_AB; 2486 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2487 } 2488 biscompressed = PETSC_FALSE; 2489 ciscompressed = PETSC_FALSE; 2490 switch (ptype) { 2491 case MATPRODUCT_AB: 2492 m = A->rmap->n; 2493 n = B->cmap->n; 2494 k = A->cmap->n; 2495 Amat = Acusp->mat; 2496 Bmat = Bcusp->mat; 2497 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2498 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2499 break; 2500 case MATPRODUCT_AtB: 2501 m = A->cmap->n; 2502 n = B->cmap->n; 2503 k = A->rmap->n; 2504 CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2505 Amat = Acusp->matTranspose; 2506 Bmat = Bcusp->mat; 2507 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2508 break; 2509 case MATPRODUCT_ABt: 2510 m = A->rmap->n; 2511 n = B->rmap->n; 2512 k = A->cmap->n; 2513 CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2514 Amat = Acusp->mat; 2515 Bmat = Bcusp->matTranspose; 2516 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2517 break; 2518 default: 2519 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2520 } 2521 2522 /* create cusparse matrix */ 2523 CHKERRQ(MatSetSizes(C,m,n,m,n)); 2524 CHKERRQ(MatSetType(C,MATSEQAIJCUSPARSE)); 2525 c = (Mat_SeqAIJ*)C->data; 2526 Ccusp = 
(Mat_SeqAIJCUSPARSE*)C->spptr; 2527 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2528 Ccsr = new CsrMatrix; 2529 2530 c->compressedrow.use = ciscompressed; 2531 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2532 c->compressedrow.nrows = a->compressedrow.nrows; 2533 CHKERRQ(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 2534 CHKERRQ(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 2535 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2536 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2537 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2538 } else { 2539 c->compressedrow.nrows = 0; 2540 c->compressedrow.i = NULL; 2541 c->compressedrow.rindex = NULL; 2542 Ccusp->workVector = NULL; 2543 Cmat->cprowIndices = NULL; 2544 } 2545 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2546 Ccusp->mat = Cmat; 2547 Ccusp->mat->mat = Ccsr; 2548 Ccsr->num_rows = Ccusp->nrows; 2549 Ccsr->num_cols = n; 2550 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2551 CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2552 CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2553 CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2554 CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 2555 CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 2556 CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2557 CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2558 CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2559 CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 
2560 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2561 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2562 c->nz = 0; 2563 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2564 Ccsr->values = new THRUSTARRAY(c->nz); 2565 goto finalizesym; 2566 } 2567 2568 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2569 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2570 Acsr = (CsrMatrix*)Amat->mat; 2571 if (!biscompressed) { 2572 Bcsr = (CsrMatrix*)Bmat->mat; 2573 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2574 BmatSpDescr = Bmat->matDescr; 2575 #endif 2576 } else { /* we need to use row offsets for the full matrix */ 2577 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2578 Bcsr = new CsrMatrix; 2579 Bcsr->num_rows = B->rmap->n; 2580 Bcsr->num_cols = cBcsr->num_cols; 2581 Bcsr->num_entries = cBcsr->num_entries; 2582 Bcsr->column_indices = cBcsr->column_indices; 2583 Bcsr->values = cBcsr->values; 2584 if (!Bcusp->rowoffsets_gpu) { 2585 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2586 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2587 CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 2588 } 2589 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2590 mmdata->Bcsr = Bcsr; 2591 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2592 if (Bcsr->num_rows && Bcsr->num_cols) { 2593 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2594 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2595 Bcsr->values->data().get(), 2596 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2597 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2598 } 2599 BmatSpDescr = mmdata->matSpBDescr; 2600 #endif 2601 } 2602 
PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2603 PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2604 /* precompute flops count */ 2605 if (ptype == MATPRODUCT_AB) { 2606 for (i=0, flops = 0; i<A->rmap->n; i++) { 2607 const PetscInt st = a->i[i]; 2608 const PetscInt en = a->i[i+1]; 2609 for (j=st; j<en; j++) { 2610 const PetscInt brow = a->j[j]; 2611 flops += 2.*(b->i[brow+1] - b->i[brow]); 2612 } 2613 } 2614 } else if (ptype == MATPRODUCT_AtB) { 2615 for (i=0, flops = 0; i<A->rmap->n; i++) { 2616 const PetscInt anzi = a->i[i+1] - a->i[i]; 2617 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2618 flops += (2.*anzi)*bnzi; 2619 } 2620 } else { /* TODO */ 2621 flops = 0.; 2622 } 2623 2624 mmdata->flops = flops; 2625 CHKERRQ(PetscLogGpuTimeBegin()); 2626 2627 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2628 CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2629 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2630 NULL, NULL, NULL, 2631 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2632 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2633 CHKERRCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2634 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2635 { 2636 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2637 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2638 */ 2639 void* dBuffer1 = NULL; 2640 void* dBuffer2 = NULL; 2641 void* dBuffer3 = NULL; 2642 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2643 size_t bufferSize1 = 0; 2644 size_t bufferSize2 = 0; 2645 size_t bufferSize3 = 0; 2646 size_t bufferSize4 = 0; 2647 size_t bufferSize5 = 0; 2648 2649 /*----------------------------------------------------------------------*/ 2650 /* ask bufferSize1 bytes for external memory */ 2651 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2652 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2653 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2654 CHKERRCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 2655 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2656 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2657 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2658 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2659 2660 /*----------------------------------------------------------------------*/ 2661 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2662 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2663 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2664 CHKERRCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 2665 CHKERRCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 2666 CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 2667 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2668 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2669 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2670 
CHKERRCUDA(cudaFree(dBuffer1)); 2671 CHKERRCUDA(cudaFree(dBuffer2)); 2672 2673 /*----------------------------------------------------------------------*/ 2674 /* get matrix C non-zero entries C_nnz1 */ 2675 CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2676 c->nz = (PetscInt) C_nnz1; 2677 /* allocate matrix C */ 2678 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2679 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2680 /* update matC with the new pointers */ 2681 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2682 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2683 2684 /*----------------------------------------------------------------------*/ 2685 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2686 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2687 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2688 CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 2689 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2690 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2691 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2692 CHKERRCUDA(cudaFree(dBuffer3)); 2693 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2694 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2695 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2696 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2697 CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 2698 } 2699 #else 2700 size_t 
bufSize2; 2701 /* ask bufferSize bytes for external memory */ 2702 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2703 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2704 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2705 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2706 CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 2707 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2708 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2709 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2710 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2711 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2712 /* ask bufferSize again bytes for external memory */ 2713 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2714 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2715 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2716 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2717 /* The CUSPARSE documentation is not clear, nor the API 2718 We need both buffers to perform the operations properly! 2719 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2720 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2721 is stored in the descriptor! What a messy API... 
*/ 2722 CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 2723 /* compute the intermediate product of A * B */ 2724 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2725 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2726 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2727 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2728 /* get matrix C non-zero entries C_nnz1 */ 2729 CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2730 c->nz = (PetscInt) C_nnz1; 2731 CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 2732 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2733 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2734 Ccsr->values = new THRUSTARRAY(c->nz); 2735 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2736 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2737 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2738 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2739 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2740 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2741 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2742 #else 2743 CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2744 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2745 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2746 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2747 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), 
Bcsr->column_indices->data().get(), 2748 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2749 c->nz = cnz; 2750 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2751 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2752 Ccsr->values = new THRUSTARRAY(c->nz); 2753 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2754 2755 CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2756 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2757 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2758 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2759 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2760 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2761 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2762 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2763 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2764 #endif 2765 CHKERRQ(PetscLogGpuFlops(mmdata->flops)); 2766 CHKERRQ(PetscLogGpuTimeEnd()); 2767 finalizesym: 2768 c->singlemalloc = PETSC_FALSE; 2769 c->free_a = PETSC_TRUE; 2770 c->free_ij = PETSC_TRUE; 2771 CHKERRQ(PetscMalloc1(m+1,&c->i)); 2772 CHKERRQ(PetscMalloc1(c->nz,&c->j)); 2773 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2774 PetscInt *d_i = c->i; 2775 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2776 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2777 ii = *Ccsr->row_offsets; 2778 jj = *Ccsr->column_indices; 2779 if (ciscompressed) d_i = c->compressedrow.i; 2780 
CHKERRCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2781 CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2782 } else { 2783 PetscInt *d_i = c->i; 2784 if (ciscompressed) d_i = c->compressedrow.i; 2785 CHKERRCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2786 CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2787 } 2788 if (ciscompressed) { /* need to expand host row offsets */ 2789 PetscInt r = 0; 2790 c->i[0] = 0; 2791 for (k = 0; k < c->compressedrow.nrows; k++) { 2792 const PetscInt next = c->compressedrow.rindex[k]; 2793 const PetscInt old = c->compressedrow.i[k]; 2794 for (; r < next; r++) c->i[r+1] = old; 2795 } 2796 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2797 } 2798 CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 2799 CHKERRQ(PetscMalloc1(m,&c->ilen)); 2800 CHKERRQ(PetscMalloc1(m,&c->imax)); 2801 c->maxnz = c->nz; 2802 c->nonzerorowcnt = 0; 2803 c->rmax = 0; 2804 for (k = 0; k < m; k++) { 2805 const PetscInt nn = c->i[k+1] - c->i[k]; 2806 c->ilen[k] = c->imax[k] = nn; 2807 c->nonzerorowcnt += (PetscInt)!!nn; 2808 c->rmax = PetscMax(c->rmax,nn); 2809 } 2810 CHKERRQ(MatMarkDiagonal_SeqAIJ(C)); 2811 CHKERRQ(PetscMalloc1(c->nz,&c->a)); 2812 Ccsr->num_entries = c->nz; 2813 2814 C->nonzerostate++; 2815 CHKERRQ(PetscLayoutSetUp(C->rmap)); 2816 CHKERRQ(PetscLayoutSetUp(C->cmap)); 2817 Ccusp->nonzerostate = C->nonzerostate; 2818 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2819 C->preallocated = PETSC_TRUE; 2820 C->assembled = PETSC_FALSE; 2821 C->was_assembled = PETSC_FALSE; 2822 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the 
            matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* is B dense? if so we may dispatch to the AIJ*Dense CUDA kernels below */
  CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* only consider the GPU backend when neither operand has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    CHKERRQ(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      CHKERRQ(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option database key depends on whether the user came through the old-style API
       (e.g. MatMatMult) or the MatProduct API; both toggle the same CPU fallback flag */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user requested the CPU fallback */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        CHKERRQ(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no native CUSPARSE triple products: compose them from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    CHKERRQ(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx, delegating to the shared add-kernel with yy=NULL, no transpose */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* yy = A^H * xx (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* zz = A^H * xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* yy = A^T * xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* 1-D scatter-add kernel: y[idx[i]] += x[i] for i in [0,n).
   Launched below with ceil(n/256) blocks of 256 threads; the i < n guard handles the tail. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* dense-vector lengths for the generic SpMV API; set and used only on the CSR path below */
#endif

  PetscFunctionBegin;
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: the product is zero, so only yy (if any) contributes */
    if (!yy) CHKERRQ(VecSet_SeqCUDA(zz,0));
    else CHKERRQ(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose, multiplied non-transposed */
      if (!cusparsestruct->matTranspose) CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    CHKERRQ(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) CHKERRQ(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else CHKERRQ(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    CHKERRQ(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cached descriptors/buffers in cuSpMV[] */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        CHKERRCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        CHKERRCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      CHKERRCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      CHKERRCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        CHKERRCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
#endif
      }
    }
    CHKERRQ(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          CHKERRQ(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        CHKERRQ(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        CHKERRQ(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        CHKERRQ(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    CHKERRQ(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) CHKERRQ(VecCUDARestoreArray(zz,&zarray));
    else CHKERRQ(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    CHKERRQ(PetscLogGpuFlops(2.0*a->nz));
  } else {
    CHKERRQ(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* zz = A^T * xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscObjectState   onnz = A->nonzerostate; /* state before CPU-side assembly */
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatAssemblyEnd_SeqAIJ(A,mode));
  /* a changed nonzero state invalidates the cached device matrix; free it so it is rebuilt */
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    CHKERRQ(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    CHKERRCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  CHKERRQ(MatCreate(comm,A));
  CHKERRQ(MatSetSizes(*A,m,n,m,n));
  CHKERRQ(MatSetType(*A,MATSEQAIJCUSPARSE));
  CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* the GPU payload differs between plain matrices and triangular factors */
  if (A->factortype == MAT_FACTOR_NONE) {
    CHKERRQ(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    CHKERRQ(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  /* remove every composed method installed by MatConvert_SeqAIJ_SeqAIJCUSPARSE()/MatBindToCPU */
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  CHKERRQ(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* duplicate via the SeqAIJ path, then convert the copy in place back to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  CHKERRQ(MatDuplicate_SeqAIJ(A,cpvalues,B));
  CHKERRQ(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}

/* Y = Y + a*X, performed on the GPU when both operands live there */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed backends: fall back to the CPU implementation */
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(Y));
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: identical row offsets and column indices mean identical patterns */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1*Y via cusparse csr geam, writing the result over Y in place */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t bufferSize;
    void   *buffer;
#endif

    CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* a and b live on the host here, so switch pointer mode for the geam call */
    CHKERRCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    CHKERRCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    CHKERRCUDA(cudaMalloc(&buffer,bufferSize));
    CHKERRQ(PetscLogGpuTimeBegin());
    CHKERRCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    CHKERRQ(PetscLogGpuFlops(x->nz + y->nz));
    CHKERRQ(PetscLogGpuTimeEnd());
    CHKERRCUDA(cudaFree(buffer));
#else
    CHKERRQ(PetscLogGpuTimeBegin());
    CHKERRCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    CHKERRQ(PetscLogGpuFlops(x->nz + y->nz));
    CHKERRQ(PetscLogGpuTimeEnd());
#endif
    CHKERRCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the values arrays line up, so a single cublas axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle));
    CHKERRQ(PetscBLASIntCast(x->nz,&bnz));
    CHKERRQ(PetscLogGpuTimeBegin());
    CHKERRCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    CHKERRQ(PetscLogGpuFlops(2.0*bnz));
    CHKERRQ(PetscLogGpuTimeEnd());
    CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: let the CPU implementation rebuild the structure */
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y, as one cublas scal over the values array on the GPU */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay));
  CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle));
  CHKERRQ(PetscBLASIntCast(y->nz,&bnz));
  CHKERRQ(PetscLogGpuTimeBegin());
  CHKERRCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
  CHKERRQ(PetscLogGpuFlops(bnz));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
  CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}

/* zero the values on both the device copies (mat and cached transpose) and the host copy */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool  both = PETSC_FALSE; /* set when the device copy was zeroed too */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  CHKERRQ(PetscArrayzero(a->a,a->i[A->rmap->n]));
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* swap the operation tables (and composed methods) between the CPU SeqAIJ and the CUSPARSE implementations */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) { /* factors only record the flag */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* ensure the host copy is current before routing all operations to the CPU */
    CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    CHKERRQ(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-side optimization; enable them only when bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    CHKERRQ(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    CHKERRQ(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  CHKERRQ(PetscFree(B->defaultvectype));
  CHKERRQ(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      CHKERRQ(PetscNew(&spptr));
      CHKERRCUSPARSE(cusparseCreate(&spptr->handle));
      CHKERRCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
 #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
 #endif
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      CHKERRQ(PetscNew(&spptr));
      CHKERRCUSPARSE(cusparseCreate(&spptr->handle));
      CHKERRCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU operation table and composed methods */
  CHKERRQ(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  CHKERRQ(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  CHKERRQ(MatCreate_SeqAIJ(B));
  CHKERRQ(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3578 3579 Options Database Keys: 3580 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3581 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3582 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3583 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3584 3585 Level: beginner 3586 3587 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3588 M*/ 3589 3590 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3591 3592 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3593 { 3594 PetscFunctionBegin; 3595 CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 3596 CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 3597 CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 3598 CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 3599 CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 3600 3601 PetscFunctionReturn(0); 3602 } 3603 3604 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3605 { 3606 Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 3607 cudaError_t cerr; 3608 3609 PetscFunctionBegin; 3610 if (!cusp) PetscFunctionReturn(0); 3611 delete cusp->cooPerm; 3612 delete cusp->cooPerm_a; 3613 cusp->cooPerm 
= NULL; 3614 cusp->cooPerm_a = NULL; 3615 if (cusp->use_extended_coo) { 3616 CHKERRCUDA(cudaFree(cusp->jmap_d)); 3617 CHKERRCUDA(cudaFree(cusp->perm_d)); 3618 } 3619 cusp->use_extended_coo = PETSC_FALSE; 3620 PetscFunctionReturn(0); 3621 } 3622 3623 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3624 { 3625 PetscFunctionBegin; 3626 if (*cusparsestruct) { 3627 CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 3628 CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 3629 delete (*cusparsestruct)->workVector; 3630 delete (*cusparsestruct)->rowoffsets_gpu; 3631 delete (*cusparsestruct)->cooPerm; 3632 delete (*cusparsestruct)->cooPerm_a; 3633 delete (*cusparsestruct)->csr2csc_i; 3634 if ((*cusparsestruct)->handle) CHKERRCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 3635 if ((*cusparsestruct)->jmap_d) CHKERRCUDA(cudaFree((*cusparsestruct)->jmap_d)); 3636 if ((*cusparsestruct)->perm_d) CHKERRCUDA(cudaFree((*cusparsestruct)->perm_d)); 3637 CHKERRQ(PetscFree(*cusparsestruct)); 3638 } 3639 PetscFunctionReturn(0); 3640 } 3641 3642 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3643 { 3644 PetscFunctionBegin; 3645 if (*mat) { 3646 delete (*mat)->values; 3647 delete (*mat)->column_indices; 3648 delete (*mat)->row_offsets; 3649 delete *mat; 3650 *mat = 0; 3651 } 3652 PetscFunctionReturn(0); 3653 } 3654 3655 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3656 { 3657 PetscFunctionBegin; 3658 if (*trifactor) { 3659 if ((*trifactor)->descr) CHKERRCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3660 if ((*trifactor)->solveInfo) CHKERRCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo)); 3661 CHKERRQ(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 3662 if ((*trifactor)->solveBuffer) CHKERRCUDA(cudaFree((*trifactor)->solveBuffer)); 3663 if 
((*trifactor)->AA_h) CHKERRCUDA(cudaFreeHost((*trifactor)->AA_h)); 3664 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3665 if ((*trifactor)->csr2cscBuffer) CHKERRCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3666 #endif 3667 CHKERRQ(PetscFree(*trifactor)); 3668 } 3669 PetscFunctionReturn(0); 3670 } 3671 3672 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3673 { 3674 CsrMatrix *mat; 3675 3676 PetscFunctionBegin; 3677 if (*matstruct) { 3678 if ((*matstruct)->mat) { 3679 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3680 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3681 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3682 #else 3683 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3684 CHKERRCUSPARSE(cusparseDestroyHybMat(hybMat)); 3685 #endif 3686 } else { 3687 mat = (CsrMatrix*)(*matstruct)->mat; 3688 CsrMatrix_Destroy(&mat); 3689 } 3690 } 3691 if ((*matstruct)->descr) CHKERRCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 3692 delete (*matstruct)->cprowIndices; 3693 if ((*matstruct)->alpha_one) CHKERRCUDA(cudaFree((*matstruct)->alpha_one)); 3694 if ((*matstruct)->beta_zero) CHKERRCUDA(cudaFree((*matstruct)->beta_zero)); 3695 if ((*matstruct)->beta_one) CHKERRCUDA(cudaFree((*matstruct)->beta_one)); 3696 3697 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3698 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3699 if (mdata->matDescr) CHKERRCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3700 for (int i=0; i<3; i++) { 3701 if (mdata->cuSpMV[i].initialized) { 3702 CHKERRCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 3703 CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 3704 CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3705 } 3706 } 3707 #endif 3708 delete *matstruct; 3709 *matstruct = NULL; 3710 } 3711 PetscFunctionReturn(0); 3712 } 3713 3714 PetscErrorCode 
MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    /* release factor data but keep the container and its cusparse handle alive */
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) CHKERRCUDA(cudaFree((*trifactors)->a_band_d));
    if ((*trifactors)->i_band_d) CHKERRCUDA(cudaFree((*trifactors)->i_band_d));
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Fully destroys the triangular-factors container, including its cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    handle = (*trifactors)->handle; /* explicit assignment, not assignment-in-condition */
    if (handle) {
      CHKERRCUSPARSE(cusparseDestroy(handle));
    }
    CHKERRQ(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}

/* lexicographic (row,col) ordering for COO entries */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* equality of (row,col) pairs, used to collapse repeated COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 if two consecutive indices are equal, 1 otherwise (used with adjacent_difference) */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

/* logical OR of two change flags */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
   Scatters (and possibly reduces) the user-provided COO values v[] into the CSR value
   array on the device, honoring INSERT_VALUES/ADD_VALUES semantics. v may live on the
   host or on the device. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation state: behave as a plain assembly */
    CHKERRQ(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    CHKERRQ(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them into a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  CHKERRQ(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD_VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  CHKERRQ(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  CHKERRQ(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  CHKERRQ(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  CHKERRQ(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Marks the cached transpose as stale; when destroy is true also frees it together with
   the csr2csc permutation used to rebuild it */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  CHKERRQ(PetscLayoutSetUp(A->rmap));
  CHKERRQ(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: throw away the old permutation state */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    CHKERRQ(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    CHKERRQ(PetscLogGpuTimeEnd());

    /* rebuild the host CSR index arrays from the device results */
    CHKERRQ(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    CHKERRQ(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    CHKERRCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    CHKERRQ(PetscMalloc1(a->nz,&a->a));
    CHKERRQ(PetscMalloc1(a->nz,&a->j));
    CHKERRCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) CHKERRQ(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) CHKERRQ(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    CHKERRQ(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    CHKERRQ(MatMarkDiagonal_SeqAIJ(A));
  } else {
    CHKERRQ(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  CHKERRQ(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  CHKERRQ(PetscArrayzero(a->a,a->nz));
  CHKERRQ(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point: dispatches to the fast 'basic' device path when the
   host-visible indices contain no negative entries, otherwise falls back to the
   extended CPU path and mirrors its jmap/perm arrays onto the device */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  CHKERRQ(MatResetPreallocationCOO_SeqAIJ(mat));
  CHKERRQ(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    CHKERRQ(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices (ignored entries) force the extended path; device-resident
         indices are assumed basic since we cannot scan them here */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    CHKERRQ(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ*>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* mirror the CPU-built jmap/perm COO maps onto the device for MatAddCOOValues */
    CHKERRCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    CHKERRCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}

/* Grid-stride kernel: for each of the nnz CSR entries i, sum the COO input values
   kv[perm[jmap[i]..jmap[i+1])] that map to it and insert/add into a[i] per imode */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}

/* COO value insertion: extended path launches MatAddCOOValues on the device (staging
   host values first if needed); otherwise defers to the 'basic' thrust implementation */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount         Annz = seq->nz;
  PetscMemType       memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    CHKERRQ(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      CHKERRCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites, so write-only access suffices and skips a host->device copy */
    if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else                        CHKERRQ(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      CHKERRCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else                        CHKERRQ(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) CHKERRCUDA(cudaFree((void*)v1));
  } else {
    /* no extended COO state: use the thrust-based 'basic' implementation */
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* both pointers are required; the later if (i)/if (j) checks are then always true */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily upload the uncompressed row offsets from the host CSR */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only invalidates the caller's pointers; no device state changes */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); /* read access: make sure device data is current */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state increase needed, just invalidate the pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: bring device data up to date first */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;                        /* device copy becomes the authoritative one */
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* values may change -> cached transpose stale */
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A)); /* values may have been modified */
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only: unlike the read variants, no MatSeqAIJCUSPARSECopyToGPU() here */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A)); /* values were (re)written */
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/* lexicographic ordering on the first two components of an (i,j,value,index) tuple */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* adds a constant offset to a column index (used to shift B's columns when concatenating) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C = [A B] with m rows and the columns of A followed by those of B */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    CHKERRQ(MatCreate(PETSC_COMM_SELF,C));
    CHKERRQ(MatSetSizes(*C,m,n,m,n));
    CHKERRQ(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4411 CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4412 CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4413 CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 4414 CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 4415 CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4416 CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4417 CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4418 CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4419 CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 4420 CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B)); 4421 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4422 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4423 4424 Acsr = (CsrMatrix*)Acusp->mat->mat; 4425 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4426 Annz = (PetscInt)Acsr->column_indices->size(); 4427 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4428 c->nz = Annz + Bnnz; 4429 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4430 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4431 Ccsr->values = new THRUSTARRAY(c->nz); 4432 Ccsr->num_entries = c->nz; 4433 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4434 if (c->nz) { 4435 auto Acoo = new THRUSTINTARRAY32(Annz); 4436 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4437 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4438 THRUSTINTARRAY32 *Aroff,*Broff; 4439 4440 if (a->compressedrow.use) { /* need full row offset */ 4441 if (!Acusp->rowoffsets_gpu) { 4442 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4443 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4444 CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 
1)*sizeof(PetscInt))); 4445 } 4446 Aroff = Acusp->rowoffsets_gpu; 4447 } else Aroff = Acsr->row_offsets; 4448 if (b->compressedrow.use) { /* need full row offset */ 4449 if (!Bcusp->rowoffsets_gpu) { 4450 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4451 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4452 CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 4453 } 4454 Broff = Bcusp->rowoffsets_gpu; 4455 } else Broff = Bcsr->row_offsets; 4456 CHKERRQ(PetscLogGpuTimeBegin()); 4457 stat = cusparseXcsr2coo(Acusp->handle, 4458 Aroff->data().get(), 4459 Annz, 4460 m, 4461 Acoo->data().get(), 4462 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4463 stat = cusparseXcsr2coo(Bcusp->handle, 4464 Broff->data().get(), 4465 Bnnz, 4466 m, 4467 Bcoo->data().get(), 4468 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4469 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4470 auto Aperm = thrust::make_constant_iterator(1); 4471 auto Bperm = thrust::make_constant_iterator(0); 4472 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4473 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4474 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4475 #else 4476 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4477 auto Bcib = Bcsr->column_indices->begin(); 4478 auto Bcie = Bcsr->column_indices->end(); 4479 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4480 #endif 4481 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4482 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4483 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4484 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4485 auto Bze = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4486 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4487 auto p1 = Ccusp->cooPerm->begin(); 4488 auto p2 = Ccusp->cooPerm->begin(); 4489 thrust::advance(p2,Annz); 4490 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4491 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4492 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4493 #endif 4494 auto cci = thrust::make_counting_iterator(zero); 4495 auto cce = thrust::make_counting_iterator(c->nz); 4496 #if 0 //Errors on SUMMIT cuda 11.1.0 4497 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4498 #else 4499 auto pred = thrust::identity<int>(); 4500 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4501 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4502 #endif 4503 stat = cusparseXcoo2csr(Ccusp->handle, 4504 Ccoo->data().get(), 4505 c->nz, 4506 m, 4507 Ccsr->row_offsets->data().get(), 4508 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4509 CHKERRQ(PetscLogGpuTimeEnd()); 4510 delete wPerm; 4511 delete Acoo; 4512 delete Bcoo; 4513 delete Ccoo; 4514 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4515 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4516 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4517 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4518 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4519 #endif 4520 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4521 CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 4522 CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4523 PetscBool 
AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4524 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4525 CsrMatrix *CcsrT = new CsrMatrix; 4526 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4527 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4528 4529 (*C)->form_explicit_transpose = PETSC_TRUE; 4530 (*C)->transupdated = PETSC_TRUE; 4531 Ccusp->rowoffsets_gpu = NULL; 4532 CmatT->cprowIndices = NULL; 4533 CmatT->mat = CcsrT; 4534 CcsrT->num_rows = n; 4535 CcsrT->num_cols = m; 4536 CcsrT->num_entries = c->nz; 4537 4538 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4539 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4540 CcsrT->values = new THRUSTARRAY(c->nz); 4541 4542 CHKERRQ(PetscLogGpuTimeBegin()); 4543 auto rT = CcsrT->row_offsets->begin(); 4544 if (AT) { 4545 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4546 thrust::advance(rT,-1); 4547 } 4548 if (BT) { 4549 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4550 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4551 thrust::copy(titb,tite,rT); 4552 } 4553 auto cT = CcsrT->column_indices->begin(); 4554 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4555 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4556 auto vT = CcsrT->values->begin(); 4557 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4558 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4559 CHKERRQ(PetscLogGpuTimeEnd()); 4560 4561 CHKERRCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 4562 CHKERRCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 4563 CHKERRCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4564 CHKERRCUDA(cudaMalloc((void 
**)&(CmatT->alpha_one),sizeof(PetscScalar))); 4565 CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 4566 CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 4567 CHKERRCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4568 CHKERRCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4569 CHKERRCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4570 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4571 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4572 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4573 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4574 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4575 #endif 4576 Ccusp->matTranspose = CmatT; 4577 } 4578 } 4579 4580 c->singlemalloc = PETSC_FALSE; 4581 c->free_a = PETSC_TRUE; 4582 c->free_ij = PETSC_TRUE; 4583 CHKERRQ(PetscMalloc1(m+1,&c->i)); 4584 CHKERRQ(PetscMalloc1(c->nz,&c->j)); 4585 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4586 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4587 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4588 ii = *Ccsr->row_offsets; 4589 jj = *Ccsr->column_indices; 4590 CHKERRCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4591 CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4592 } else { 4593 CHKERRCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4594 CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4595 } 4596 
CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 4597 CHKERRQ(PetscMalloc1(m,&c->ilen)); 4598 CHKERRQ(PetscMalloc1(m,&c->imax)); 4599 c->maxnz = c->nz; 4600 c->nonzerorowcnt = 0; 4601 c->rmax = 0; 4602 for (i = 0; i < m; i++) { 4603 const PetscInt nn = c->i[i+1] - c->i[i]; 4604 c->ilen[i] = c->imax[i] = nn; 4605 c->nonzerorowcnt += (PetscInt)!!nn; 4606 c->rmax = PetscMax(c->rmax,nn); 4607 } 4608 CHKERRQ(MatMarkDiagonal_SeqAIJ(*C)); 4609 CHKERRQ(PetscMalloc1(c->nz,&c->a)); 4610 (*C)->nonzerostate++; 4611 CHKERRQ(PetscLayoutSetUp((*C)->rmap)); 4612 CHKERRQ(PetscLayoutSetUp((*C)->cmap)); 4613 Ccusp->nonzerostate = (*C)->nonzerostate; 4614 (*C)->preallocated = PETSC_TRUE; 4615 } else { 4616 PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4617 c = (Mat_SeqAIJ*)(*C)->data; 4618 if (c->nz) { 4619 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4620 PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4621 PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4622 PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4623 CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 4624 CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B)); 4625 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4626 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4627 Acsr = (CsrMatrix*)Acusp->mat->mat; 4628 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4629 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4630 PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 4631 
PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4632 PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4633 PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4634 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4635 auto pmid = Ccusp->cooPerm->begin(); 4636 thrust::advance(pmid,Acsr->num_entries); 4637 CHKERRQ(PetscLogGpuTimeBegin()); 4638 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4639 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4640 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4641 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4642 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4643 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4644 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4645 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4646 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4647 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4648 CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 4649 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4650 PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose 
Mat_SeqAIJCUSPARSEMultStruct"); 4651 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4652 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4653 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4654 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4655 auto vT = CcsrT->values->begin(); 4656 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4657 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4658 (*C)->transupdated = PETSC_TRUE; 4659 } 4660 CHKERRQ(PetscLogGpuTimeEnd()); 4661 } 4662 } 4663 CHKERRQ(PetscObjectStateIncrease((PetscObject)*C)); 4664 (*C)->assembled = PETSC_TRUE; 4665 (*C)->was_assembled = PETSC_FALSE; 4666 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4667 PetscFunctionReturn(0); 4668 } 4669 4670 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4671 { 4672 bool dmem; 4673 const PetscScalar *av; 4674 4675 PetscFunctionBegin; 4676 dmem = isCudaMem(v); 4677 CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 4678 if (n && idx) { 4679 THRUSTINTARRAY widx(n); 4680 widx.assign(idx,idx+n); 4681 CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscInt))); 4682 4683 THRUSTARRAY *w = NULL; 4684 thrust::device_ptr<PetscScalar> dv; 4685 if (dmem) { 4686 dv = thrust::device_pointer_cast(v); 4687 } else { 4688 w = new THRUSTARRAY(n); 4689 dv = w->data(); 4690 } 4691 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4692 4693 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4694 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4695 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4696 if (w) { 4697 CHKERRCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4698 } 4699 delete w; 4700 
} else { 4701 CHKERRCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4702 } 4703 if (!dmem) CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4704 CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4705 PetscFunctionReturn(0); 4706 } 4707