1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/adjacent_difference.h> 16 #include <thrust/async/for_each.h> 17 #include <thrust/iterator/constant_iterator.h> 18 #include <thrust/remove.h> 19 #include <thrust/sort.h> 20 #include <thrust/unique.h> 21 22 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 26 27 typedef enum { 28 CUSPARSE_MV_ALG_DEFAULT = 0, 29 CUSPARSE_COOMV_ALG = 1, 30 CUSPARSE_CSRMV_ALG1 = 2, 31 CUSPARSE_CSRMV_ALG2 = 3 32 } cusparseSpMVAlg_t; 33 34 typedef enum { 35 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40 CUSPARSE_SPMM_ALG_DEFAULT = 0, 41 CUSPARSE_SPMM_COO_ALG1 = 1, 42 CUSPARSE_SPMM_COO_ALG2 = 2, 43 CUSPARSE_SPMM_COO_ALG3 = 3, 44 CUSPARSE_SPMM_COO_ALG4 = 5, 45 CUSPARSE_SPMM_CSR_ALG1 = 4, 46 CUSPARSE_SPMM_CSR_ALG2 = 6, 47 } cusparseSpMMAlg_t; 48 49 typedef enum { 50 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52 } cusparseCsr2CscAlg_t; 53 */ 54 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 56 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57 #endif 58 59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62 63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66 67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 81 82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 87 88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 90 91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]); 93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 94 95 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 96 { 97 cusparseStatus_t stat; 98 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 99 100 PetscFunctionBegin; 101 PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 102 cusparsestruct->stream = stream; 103 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 104 PetscFunctionReturn(0); 105 } 106 107 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 108 { 109 cusparseStatus_t stat; 110 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 111 112 PetscFunctionBegin; 113 PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 114 if (cusparsestruct->handle != handle) { 115 if (cusparsestruct->handle) { 116 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 117 } 118 cusparsestruct->handle = handle; 119 } 120 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 121 PetscFunctionReturn(0); 122 } 123 124 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 125 { 126 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 127 PetscBool flg; 128 PetscErrorCode ierr; 129 130 PetscFunctionBegin; 131 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 132 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 133 if (cusparsestruct->handle) cusparsestruct->handle = 0; 134 PetscFunctionReturn(0); 135 } 136 137 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 138 { 139 PetscFunctionBegin; 140 *type = MATSOLVERCUSPARSE; 141 PetscFunctionReturn(0); 142 } 143 144 /*MC 145 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 146 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 147 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 148 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 149 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 150 algorithms are not recommended. This class does NOT support direct solver operations. 151 152 Level: beginner 153 154 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 155 M*/ 156 157 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 158 { 159 PetscErrorCode ierr; 160 PetscInt n = A->rmap->n; 161 162 PetscFunctionBegin; 163 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 164 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 165 (*B)->factortype = ftype; 166 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 167 168 if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); } 169 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 170 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 171 if (!A->boundtocpu) { 172 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 173 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 174 } else { 175 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 176 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 177 } 178 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 179 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 180 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 181 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 182 if (!A->boundtocpu) { 183 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 184 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 185 } else { 186 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 187 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 188 } 189 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 190 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 191 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 192 193 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 194 (*B)->canuseordering = PETSC_TRUE; 195 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 196 PetscFunctionReturn(0); 197 } 198 199 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 200 { 201 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 202 203 PetscFunctionBegin; 204 switch (op) { 205 case MAT_CUSPARSE_MULT: 206 cusparsestruct->format = format; 207 break; 208 case MAT_CUSPARSE_ALL: 209 cusparsestruct->format = format; 210 break; 211 default: 212 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 213 } 214 PetscFunctionReturn(0); 215 } 216 217 /*@ 218 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 219 operation. Only the MatMult operation can use different GPU storage formats 220 for MPIAIJCUSPARSE matrices. 221 Not Collective 222 223 Input Parameters: 224 + A - Matrix of type SEQAIJCUSPARSE 225 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 226 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 227 228 Output Parameter: 229 230 Level: intermediate 231 232 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 233 @*/ 234 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 235 { 236 PetscErrorCode ierr; 237 238 PetscFunctionBegin; 239 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 240 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 241 PetscFunctionReturn(0); 242 } 243 244 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 245 { 246 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 247 248 PetscFunctionBegin; 249 cusparsestruct->use_cpu_solve = use_cpu; 250 PetscFunctionReturn(0); 251 } 252 253 /*@ 254 MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 255 256 Input Parameters: 257 + A - Matrix of type SEQAIJCUSPARSE 258 - use_cpu - set flag for using the built-in CPU MatSolve 259 260 Output Parameter: 261 262 Notes: 263 The cuSparse LU solver currently computes the factors with the built-in CPU method 264 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 265 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 266 267 Level: intermediate 268 269 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 270 @*/ 271 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 272 { 273 PetscErrorCode ierr; 274 275 PetscFunctionBegin; 276 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 277 ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr); 278 PetscFunctionReturn(0); 279 } 280 281 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 282 { 283 PetscErrorCode ierr; 284 285 PetscFunctionBegin; 286 switch (op) { 287 case MAT_FORM_EXPLICIT_TRANSPOSE: 288 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 289 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 290 A->form_explicit_transpose = flg; 291 break; 292 default: 293 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 294 break; 295 } 296 PetscFunctionReturn(0); 297 } 298 299 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 300 301 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 302 { 303 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 304 IS isrow = b->row,iscol = b->col; 305 PetscBool row_identity,col_identity; 306 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 307 PetscErrorCode ierr; 308 309 PetscFunctionBegin; 310 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 311 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 312 B->offloadmask = PETSC_OFFLOAD_CPU; 313 /* determine which version of MatSolve needs to be used. */ 314 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 315 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 316 if (row_identity && col_identity) { 317 if (!cusparsestruct->use_cpu_solve) { 318 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 319 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 320 } 321 B->ops->matsolve = NULL; 322 B->ops->matsolvetranspose = NULL; 323 } else { 324 if (!cusparsestruct->use_cpu_solve) { 325 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 326 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 327 } 328 B->ops->matsolve = NULL; 329 B->ops->matsolvetranspose = NULL; 330 } 331 332 /* get the triangular factors */ 333 if (!cusparsestruct->use_cpu_solve) { 334 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 335 } 336 PetscFunctionReturn(0); 337 } 338 339 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 340 { 341 PetscErrorCode ierr; 342 MatCUSPARSEStorageFormat format; 343 PetscBool flg; 344 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 345 346 PetscFunctionBegin; 347 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 348 if (A->factortype == MAT_FACTOR_NONE) { 349 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 350 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 351 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 352 353 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 354 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 355 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 356 ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr); 357 if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);} 358 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 359 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 360 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 361 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 362 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 363 PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 364 #else 365 PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 366 #endif 367 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 368 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 369 PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 370 371 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 372 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 373 PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 374 #endif 375 } 376 ierr = PetscOptionsTail();CHKERRQ(ierr); 377 PetscFunctionReturn(0); 378 } 379 380 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 381 { 382 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 383 PetscErrorCode ierr; 384 385 PetscFunctionBegin; 386 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 387 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 388 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 389 PetscFunctionReturn(0); 390 } 391 392 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 393 { 394 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 395 PetscErrorCode ierr; 396 397 PetscFunctionBegin; 398 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 399 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 400 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 401 PetscFunctionReturn(0); 402 } 403 404 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 405 { 406 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 407 PetscErrorCode ierr; 408 409 PetscFunctionBegin; 410 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 411 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 412 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 413 PetscFunctionReturn(0); 414 } 415 416 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 417 { 418 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 419 PetscErrorCode ierr; 420 421 PetscFunctionBegin; 422 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 423 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 424 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 425 PetscFunctionReturn(0); 426 } 427 428 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 429 { 430 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 431 PetscInt n = A->rmap->n; 432 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 433 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 434 cusparseStatus_t stat; 435 const PetscInt *ai = a->i,*aj = a->j,*vi; 436 const MatScalar *aa = a->a,*v; 437 PetscInt *AiLo, *AjLo; 438 PetscInt i,nz, nzLower, offset, rowOffset; 439 PetscErrorCode ierr; 440 cudaError_t cerr; 441 442 PetscFunctionBegin; 443 if (!n) PetscFunctionReturn(0); 444 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 445 try { 446 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 447 nzLower=n+ai[n]-ai[1]; 448 if (!loTriFactor) { 449 PetscScalar *AALo; 450 451 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 452 453 /* Allocate Space for the lower triangular matrix */ 454 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 455 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 456 457 /* Fill the lower triangular matrix */ 458 AiLo[0] = (PetscInt) 0; 459 AiLo[n] = nzLower; 460 AjLo[0] = (PetscInt) 0; 461 AALo[0] = (MatScalar) 1.0; 462 v = aa; 463 vi = aj; 464 offset = 1; 465 rowOffset= 1; 466 for (i=1; i<n; i++) { 467 nz = ai[i+1] - ai[i]; 468 /* additional 1 for the term on the diagonal */ 469 AiLo[i] = rowOffset; 470 rowOffset += nz+1; 471 472 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 473 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 474 475 offset += nz; 476 AjLo[offset] = (PetscInt) i; 477 AALo[offset] = (MatScalar) 1.0; 478 offset += 1; 479 480 v += nz; 481 vi += nz; 482 } 483 484 /* allocate space for the triangular factor information */ 485 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 486 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 487 /* Create the matrix description */ 488 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 489 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 490 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 491 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 492 #else 493 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 494 #endif 495 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 496 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 497 498 /* set the operation */ 499 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 500 501 /* set the matrix */ 502 loTriFactor->csrMat = new CsrMatrix; 503 loTriFactor->csrMat->num_rows = n; 504 loTriFactor->csrMat->num_cols = n; 505 loTriFactor->csrMat->num_entries = nzLower; 506 507 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 508 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 509 510 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 511 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 512 513 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 514 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 515 516 /* Create the solve analysis information */ 517 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 518 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 519 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 520 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 521 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 522 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 523 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 524 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 525 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 526 #endif 527 528 /* perform the solve analysis */ 529 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 530 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 531 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 532 loTriFactor->csrMat->column_indices->data().get(), 533 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 534 loTriFactor->solveInfo, 535 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 536 #else 537 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 538 #endif 539 cerr = WaitForCUDA();CHKERRCUDA(cerr); 540 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 541 542 /* assign the pointer */ 543 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 544 loTriFactor->AA_h = AALo; 545 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 546 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 547 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 548 } else { /* update values only */ 549 if (!loTriFactor->AA_h) { 550 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 551 } 552 /* Fill the lower triangular matrix */ 553 loTriFactor->AA_h[0] = 1.0; 554 v = aa; 555 vi = aj; 556 offset = 1; 557 for (i=1; i<n; i++) { 558 nz = ai[i+1] - ai[i]; 559 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 560 offset += nz; 561 loTriFactor->AA_h[offset] = 1.0; 562 offset += 1; 563 v += nz; 564 } 565 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 566 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 567 } 568 } catch(char *ex) { 569 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 570 } 571 } 572 PetscFunctionReturn(0); 573 } 574 575 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 576 { 577 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 578 PetscInt n = A->rmap->n; 579 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 580 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 581 cusparseStatus_t stat; 582 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 583 const MatScalar *aa = a->a,*v; 584 PetscInt *AiUp, *AjUp; 585 PetscInt i,nz, nzUpper, offset; 586 PetscErrorCode ierr; 587 cudaError_t cerr; 588 589 PetscFunctionBegin; 590 if (!n) PetscFunctionReturn(0); 591 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 592 try { 593 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 594 nzUpper = adiag[0]-adiag[n]; 595 if (!upTriFactor) { 596 PetscScalar *AAUp; 597 598 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 599 600 /* Allocate Space for the upper triangular matrix */ 601 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 602 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 603 604 /* Fill the upper triangular matrix */ 605 AiUp[0]=(PetscInt) 0; 606 AiUp[n]=nzUpper; 607 offset = nzUpper; 608 for (i=n-1; i>=0; i--) { 609 v = aa + adiag[i+1] + 1; 610 vi = aj + adiag[i+1] + 1; 611 612 /* number of elements NOT on the diagonal */ 613 nz = adiag[i] - adiag[i+1]-1; 614 615 /* decrement the offset */ 616 offset -= (nz+1); 617 618 /* first, set the diagonal elements */ 619 AjUp[offset] = (PetscInt) i; 620 AAUp[offset] = (MatScalar)1./v[nz]; 621 AiUp[i] = AiUp[i+1] - (nz+1); 622 623 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 624 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 625 } 626 627 /* allocate space for the triangular factor information */ 628 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 629 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 630 631 /* Create the matrix description */ 632 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 633 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 634 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 635 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 636 #else 637 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 638 #endif 639 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 640 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 641 642 /* set the operation */ 643 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 644 645 /* set the matrix */ 646 upTriFactor->csrMat = new CsrMatrix; 647 upTriFactor->csrMat->num_rows = n; 648 upTriFactor->csrMat->num_cols = n; 649 upTriFactor->csrMat->num_entries = nzUpper; 650 651 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 652 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 653 654 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 655 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 656 657 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 658 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 659 660 /* Create the solve analysis information */ 661 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 662 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 663 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 664 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 665 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 666 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 667 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 668 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 669 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 670 #endif 671 672 /* perform the solve analysis */ 673 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 674 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 675 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 676 upTriFactor->csrMat->column_indices->data().get(), 677 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 678 upTriFactor->solveInfo, 679 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 680 #else 681 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 682 #endif 683 cerr = WaitForCUDA();CHKERRCUDA(cerr); 684 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 685 686 /* assign the pointer */ 687 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 688 upTriFactor->AA_h = AAUp; 689 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 690 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 691 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 692 } else { 693 if (!upTriFactor->AA_h) { 694 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 695 } 696 /* Fill the upper triangular matrix */ 697 offset = nzUpper; 698 for (i=n-1; i>=0; i--) { 699 v = aa + adiag[i+1] + 1; 700 701 /* number of elements NOT on the diagonal */ 702 nz = adiag[i] - adiag[i+1]-1; 703 704 /* decrement the offset */ 705 offset -= (nz+1); 706 707 /* first, set the diagonal elements */ 708 upTriFactor->AA_h[offset] = 1./v[nz]; 709 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 710 } 711 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 712 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 713 } 714 } catch(char *ex) { 715 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 716 } 717 } 718 PetscFunctionReturn(0); 719 } 720 721 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 722 { 723 PetscErrorCode ierr; 724 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 725 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 726 IS isrow = a->row,iscol = a->icol; 727 PetscBool row_identity,col_identity; 728 PetscInt n = A->rmap->n; 729 730 PetscFunctionBegin; 731 PetscCheckFalse(!cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 732 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 733 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 734 735 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 736 cusparseTriFactors->nnz=a->nz; 737 738 A->offloadmask = PETSC_OFFLOAD_BOTH; 739 /* lower triangular indices */ 740 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 741 if (!row_identity && !cusparseTriFactors->rpermIndices) { 742 const PetscInt *r; 743 744 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 745 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 746 cusparseTriFactors->rpermIndices->assign(r, r+n); 747 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 748 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 749 } 750 751 /* upper triangular indices */ 752 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 753 if (!col_identity && !cusparseTriFactors->cpermIndices) { 754 const PetscInt *c; 755 756 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 757 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 758 cusparseTriFactors->cpermIndices->assign(c, c+n); 759 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 760 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 761 } 762 PetscFunctionReturn(0); 763 } 764 765 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 766 { 767 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 768 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 769 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 770 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 771 cusparseStatus_t stat; 772 PetscErrorCode ierr; 773 cudaError_t cerr; 774 PetscInt *AiUp, *AjUp; 775 PetscScalar *AAUp; 776 PetscScalar *AALo; 777 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 778 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 779 const PetscInt *ai = b->i,*aj = b->j,*vj; 780 const MatScalar *aa = b->a,*v; 781 782 PetscFunctionBegin; 783 if (!n) PetscFunctionReturn(0); 784 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 785 try { 786 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 787 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 788 if (!upTriFactor && !loTriFactor) { 789 /* Allocate Space for the upper triangular matrix */ 790 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 791 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 792 793 /* Fill the upper triangular matrix */ 794 AiUp[0]=(PetscInt) 0; 795 AiUp[n]=nzUpper; 796 offset = 0; 797 for (i=0; i<n; i++) { 798 /* set the pointers */ 799 v = aa + ai[i]; 800 vj = aj + ai[i]; 801 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 802 803 /* first, set the diagonal elements */ 804 AjUp[offset] = (PetscInt) i; 805 AAUp[offset] = (MatScalar)1.0/v[nz]; 806 AiUp[i] = offset; 807 AALo[offset] = (MatScalar)1.0/v[nz]; 808 809 offset+=1; 810 if (nz>0) { 811 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 812 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 813 for (j=offset; j<offset+nz; j++) { 814 AAUp[j] = -AAUp[j]; 815 AALo[j] = AAUp[j]/v[nz]; 816 } 817 offset+=nz; 818 } 819 } 820 821 /* allocate space for the triangular factor information */ 822 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 823 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 824 825 /* Create the matrix description */ 826 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 827 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 828 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 829 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 830 #else 831 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 832 #endif 833 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 834 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 835 836 /* set the matrix */ 837 upTriFactor->csrMat = new CsrMatrix; 838 upTriFactor->csrMat->num_rows = A->rmap->n; 839 upTriFactor->csrMat->num_cols = A->cmap->n; 840 upTriFactor->csrMat->num_entries = a->nz; 841 842 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 843 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 844 845 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 846 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 847 848 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 849 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 850 851 /* set the operation */ 852 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 853 854 /* Create the solve analysis information */ 855 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 856 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 857 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 858 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 859 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 860 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 861 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 862 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 863 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 864 #endif 865 866 /* perform the solve analysis */ 867 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 868 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 869 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 870 upTriFactor->csrMat->column_indices->data().get(), 871 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 872 upTriFactor->solveInfo, 873 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 874 #else 875 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 876 #endif 877 cerr = WaitForCUDA();CHKERRCUDA(cerr); 878 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 879 880 /* assign the pointer */ 881 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 882 883 /* allocate space for the triangular factor information */ 884 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 885 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 886 887 /* Create the matrix description */ 888 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 889 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 890 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 891 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 892 #else 893 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 894 #endif 895 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 896 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 897 898 /* set the operation */ 899 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 900 901 /* set the matrix */ 902 loTriFactor->csrMat = new CsrMatrix; 903 loTriFactor->csrMat->num_rows = A->rmap->n; 904 loTriFactor->csrMat->num_cols = A->cmap->n; 905 loTriFactor->csrMat->num_entries = a->nz; 906 907 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 908 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 909 910 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 911 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 912 913 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 914 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 915 916 /* Create the solve analysis information */ 917 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 918 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 919 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 920 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 921 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 922 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 923 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 924 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 925 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 926 #endif 927 928 /* perform the solve analysis */ 929 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 930 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 931 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 932 loTriFactor->csrMat->column_indices->data().get(), 933 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 934 loTriFactor->solveInfo, 935 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 936 #else 937 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 938 #endif 939 cerr = WaitForCUDA();CHKERRCUDA(cerr); 940 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 941 942 /* assign the pointer */ 943 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 944 945 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 946 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 947 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 948 } else { 949 /* Fill the upper triangular matrix */ 950 offset = 0; 951 for (i=0; i<n; i++) { 952 /* set the pointers */ 953 v = aa + ai[i]; 954 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 955 956 /* first, set the diagonal elements */ 957 AAUp[offset] = 1.0/v[nz]; 958 AALo[offset] = 1.0/v[nz]; 959 960 offset+=1; 961 if (nz>0) { 962 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 963 for (j=offset; j<offset+nz; j++) { 964 AAUp[j] = -AAUp[j]; 965 AALo[j] = AAUp[j]/v[nz]; 966 } 967 offset+=nz; 968 } 969 } 970 PetscCheckFalse(!upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 971 PetscCheckFalse(!loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 972 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 973 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 974 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 975 } 976 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 977 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 978 } catch(char *ex) { 979 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 980 } 981 } 982 PetscFunctionReturn(0); 983 } 984 985 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 986 { 987 PetscErrorCode ierr; 988 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 989 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 990 IS ip = a->row; 991 PetscBool perm_identity; 992 PetscInt n = A->rmap->n; 993 994 PetscFunctionBegin; 995 PetscCheckFalse(!cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 996 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 997 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 998 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 999 1000 A->offloadmask = PETSC_OFFLOAD_BOTH; 1001 1002 /* lower triangular indices */ 1003 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1004 if (!perm_identity) { 1005 IS iip; 1006 const PetscInt *irip,*rip; 1007 1008 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 1009 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 1010 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 1011 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1012 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 1013 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1014 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 1015 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 1016 ierr = ISDestroy(&iip);CHKERRQ(ierr); 1017 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 1018 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 1019 } 1020 PetscFunctionReturn(0); 1021 } 1022 1023 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 1024 { 1025 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 1026 IS ip = b->row; 1027 PetscBool perm_identity; 1028 PetscErrorCode ierr; 1029 1030 PetscFunctionBegin; 1031 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1032 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1033 B->offloadmask = PETSC_OFFLOAD_CPU; 1034 /* determine which version of MatSolve needs to be used. */ 1035 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1036 if (perm_identity) { 1037 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1038 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1039 B->ops->matsolve = NULL; 1040 B->ops->matsolvetranspose = NULL; 1041 } else { 1042 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1043 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1044 B->ops->matsolve = NULL; 1045 B->ops->matsolvetranspose = NULL; 1046 } 1047 1048 /* get the triangular factors */ 1049 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 1050 PetscFunctionReturn(0); 1051 } 1052 1053 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1054 { 1055 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1056 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1057 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1058 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1059 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1060 cusparseStatus_t stat; 1061 cusparseIndexBase_t indexBase; 1062 cusparseMatrixType_t matrixType; 1063 cusparseFillMode_t fillMode; 1064 cusparseDiagType_t diagType; 1065 cudaError_t cerr; 1066 PetscErrorCode ierr; 1067 1068 PetscFunctionBegin; 1069 /* allocate space for the transpose of the lower triangular factor */ 1070 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1071 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1072 1073 /* set the matrix descriptors of the lower triangular factor */ 1074 matrixType = cusparseGetMatType(loTriFactor->descr); 1075 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1076 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1077 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1078 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1079 1080 /* Create the matrix description */ 1081 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1082 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1083 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1084 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1085 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1086 1087 /* set the operation */ 1088 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1089 1090 /* allocate GPU space for the CSC of the lower triangular factor*/ 1091 loTriFactorT->csrMat = new CsrMatrix; 1092 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1093 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1094 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1095 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1096 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1097 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1098 1099 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1100 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1101 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1102 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1103 loTriFactor->csrMat->values->data().get(), 1104 loTriFactor->csrMat->row_offsets->data().get(), 1105 loTriFactor->csrMat->column_indices->data().get(), 1106 loTriFactorT->csrMat->values->data().get(), 1107 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1108 CUSPARSE_ACTION_NUMERIC,indexBase, 1109 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1110 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1111 #endif 1112 1113 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1114 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1115 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1116 loTriFactor->csrMat->values->data().get(), 1117 loTriFactor->csrMat->row_offsets->data().get(), 1118 loTriFactor->csrMat->column_indices->data().get(), 1119 loTriFactorT->csrMat->values->data().get(), 1120 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1121 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1122 CUSPARSE_ACTION_NUMERIC, indexBase, 1123 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1124 #else 1125 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1126 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1127 #endif 1128 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1129 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1130 1131 /* Create the solve analysis information */ 1132 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1133 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1134 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1135 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1136 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1137 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1138 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1139 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1140 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1141 #endif 1142 1143 /* perform the solve analysis */ 1144 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1145 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1146 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1147 loTriFactorT->csrMat->column_indices->data().get(), 1148 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1149 loTriFactorT->solveInfo, 1150 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1151 #else 1152 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1153 #endif 1154 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1155 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1156 1157 /* assign the pointer */ 1158 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1159 1160 /*********************************************/ 1161 /* Now the Transpose of the Upper Tri Factor */ 1162 /*********************************************/ 1163 1164 /* allocate space for the transpose of the upper triangular factor */ 1165 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1166 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1167 1168 /* set the matrix descriptors of the upper triangular factor */ 1169 matrixType = cusparseGetMatType(upTriFactor->descr); 1170 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1171 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1172 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1173 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1174 1175 /* Create the matrix description */ 1176 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1177 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1178 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1179 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1180 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1181 1182 /* set the operation */ 1183 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1184 1185 /* allocate GPU space for the CSC of the upper triangular factor*/ 1186 upTriFactorT->csrMat = new CsrMatrix; 1187 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1188 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1189 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1190 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1191 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1192 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1193 1194 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1195 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1196 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1197 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1198 upTriFactor->csrMat->values->data().get(), 1199 upTriFactor->csrMat->row_offsets->data().get(), 1200 upTriFactor->csrMat->column_indices->data().get(), 1201 upTriFactorT->csrMat->values->data().get(), 1202 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1203 CUSPARSE_ACTION_NUMERIC,indexBase, 1204 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1205 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1206 #endif 1207 1208 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1209 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1210 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1211 upTriFactor->csrMat->values->data().get(), 1212 upTriFactor->csrMat->row_offsets->data().get(), 1213 upTriFactor->csrMat->column_indices->data().get(), 1214 upTriFactorT->csrMat->values->data().get(), 1215 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1216 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1217 CUSPARSE_ACTION_NUMERIC, indexBase, 1218 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1219 #else 1220 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1221 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1222 #endif 1223 1224 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1225 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1226 1227 /* Create the solve analysis information */ 1228 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1229 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1230 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1231 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1232 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1233 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1234 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1235 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1236 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1237 #endif 1238 1239 /* perform the solve analysis */ 1240 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1241 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1242 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1243 upTriFactorT->csrMat->column_indices->data().get(), 1244 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1245 upTriFactorT->solveInfo, 1246 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1247 #else 1248 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1249 #endif 1250 1251 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1252 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1253 1254 /* assign the pointer */ 1255 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1256 PetscFunctionReturn(0); 1257 } 1258 1259 struct PetscScalarToPetscInt 1260 { 1261 __host__ __device__ 1262 PetscInt operator()(PetscScalar s) 1263 { 1264 return (PetscInt)PetscRealPart(s); 1265 } 1266 }; 1267 1268 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1269 { 1270 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1271 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1273 cusparseStatus_t stat; 1274 cusparseIndexBase_t indexBase; 1275 cudaError_t err; 1276 PetscErrorCode ierr; 1277 1278 PetscFunctionBegin; 1279 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1280 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1281 PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1282 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1283 PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1284 if (A->transupdated) PetscFunctionReturn(0); 1285 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1286 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1287 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1288 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1289 } 1290 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1291 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1292 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1293 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1294 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1295 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1296 1297 /* set alpha and beta */ 1298 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1299 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1300 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1301 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1302 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1303 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1304 1305 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1306 CsrMatrix *matrixT = new CsrMatrix; 1307 matstructT->mat = matrixT; 1308 matrixT->num_rows = A->cmap->n; 1309 matrixT->num_cols = A->rmap->n; 1310 matrixT->num_entries = a->nz; 1311 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1312 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1313 matrixT->values = new THRUSTARRAY(a->nz); 1314 1315 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1316 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1317 1318 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1319 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1320 stat = cusparseCreateCsr(&matstructT->matDescr, 1321 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1322 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1323 matrixT->values->data().get(), 1324 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1325 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1326 #else 1327 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1328 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1329 1330 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1331 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1332 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1333 */ 1334 if (matrixT->num_entries) { 1335 stat = cusparseCreateCsr(&matstructT->matDescr, 1336 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1337 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1338 matrixT->values->data().get(), 1339 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1340 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1341 1342 } else { 1343 matstructT->matDescr = NULL; 1344 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1345 } 1346 #endif 1347 #endif 1348 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1349 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1350 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1351 #else 1352 CsrMatrix *temp = new CsrMatrix; 1353 CsrMatrix *tempT = new CsrMatrix; 1354 /* First convert HYB to CSR */ 1355 temp->num_rows = A->rmap->n; 1356 temp->num_cols = A->cmap->n; 1357 temp->num_entries = a->nz; 1358 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1359 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1360 temp->values = new THRUSTARRAY(a->nz); 1361 1362 stat = cusparse_hyb2csr(cusparsestruct->handle, 1363 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1364 temp->values->data().get(), 1365 temp->row_offsets->data().get(), 1366 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1367 1368 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1369 tempT->num_rows = A->rmap->n; 1370 tempT->num_cols = A->cmap->n; 1371 tempT->num_entries = a->nz; 1372 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1373 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1374 tempT->values = new THRUSTARRAY(a->nz); 1375 1376 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1377 temp->num_cols, temp->num_entries, 1378 temp->values->data().get(), 1379 temp->row_offsets->data().get(), 1380 temp->column_indices->data().get(), 1381 tempT->values->data().get(), 1382 tempT->column_indices->data().get(), 1383 tempT->row_offsets->data().get(), 1384 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1385 1386 /* Last, convert CSC to HYB */ 1387 cusparseHybMat_t hybMat; 1388 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1389 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1390 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1391 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1392 matstructT->descr, tempT->values->data().get(), 1393 tempT->row_offsets->data().get(), 1394 tempT->column_indices->data().get(), 1395 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1396 1397 /* assign the pointer */ 1398 matstructT->mat = hybMat; 1399 A->transupdated = PETSC_TRUE; 1400 /* delete temporaries */ 1401 if (tempT) { 1402 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1403 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1404 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1405 delete (CsrMatrix*) tempT; 1406 } 1407 if (temp) { 1408 if (temp->values) delete (THRUSTARRAY*) temp->values; 1409 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1410 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1411 delete (CsrMatrix*) temp; 1412 } 1413 #endif 1414 } 1415 } 1416 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1417 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1418 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1419 PetscCheckFalse(!matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1420 PetscCheckFalse(!matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1421 PetscCheckFalse(!matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1422 PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1423 PetscCheckFalse(!matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1424 PetscCheckFalse(!matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1425 PetscCheckFalse(!matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1426 PetscCheckFalse(!matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1427 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1428 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1429 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1430 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1431 } 1432 if (!cusparsestruct->csr2csc_i) { 1433 THRUSTARRAY csr2csc_a(matrix->num_entries); 1434 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1435 1436 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1437 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1438 void *csr2cscBuffer; 1439 size_t csr2cscBufferSize; 1440 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1441 A->cmap->n, matrix->num_entries, 1442 matrix->values->data().get(), 1443 cusparsestruct->rowoffsets_gpu->data().get(), 1444 matrix->column_indices->data().get(), 1445 matrixT->values->data().get(), 1446 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1447 CUSPARSE_ACTION_NUMERIC,indexBase, 1448 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1449 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1450 #endif 1451 1452 if (matrix->num_entries) { 1453 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1454 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1455 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1456 1457 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1458 should be filled with indexBase. So I just take a shortcut here. 1459 */ 1460 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1461 A->cmap->n,matrix->num_entries, 1462 csr2csc_a.data().get(), 1463 cusparsestruct->rowoffsets_gpu->data().get(), 1464 matrix->column_indices->data().get(), 1465 matrixT->values->data().get(), 1466 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1467 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1468 CUSPARSE_ACTION_NUMERIC,indexBase, 1469 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1470 #else 1471 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1472 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1473 #endif 1474 } else { 1475 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1476 } 1477 1478 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1479 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1480 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1481 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1482 #endif 1483 } 1484 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1485 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1486 matrixT->values->begin())); 1487 } 1488 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1489 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1490 /* the compressed row indices is not used for matTranspose */ 1491 matstructT->cprowIndices = NULL; 1492 /* assign the pointer */ 1493 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1494 A->transupdated = PETSC_TRUE; 1495 PetscFunctionReturn(0); 1496 } 1497 1498 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1499 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1500 { 1501 PetscInt n = xx->map->n; 1502 const PetscScalar *barray; 1503 PetscScalar *xarray; 1504 thrust::device_ptr<const PetscScalar> bGPU; 1505 thrust::device_ptr<PetscScalar> xGPU; 1506 cusparseStatus_t stat; 1507 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1508 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1509 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1510 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1511 PetscErrorCode ierr; 1512 1513 PetscFunctionBegin; 1514 /* Analyze the matrix and create the transpose ... on the fly */ 1515 if (!loTriFactorT && !upTriFactorT) { 1516 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1517 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1518 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1519 } 1520 1521 /* Get the GPU pointers */ 1522 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1523 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1524 xGPU = thrust::device_pointer_cast(xarray); 1525 bGPU = thrust::device_pointer_cast(barray); 1526 1527 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1528 /* First, reorder with the row permutation */ 1529 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1530 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1531 xGPU); 1532 1533 /* First, solve U */ 1534 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1535 upTriFactorT->csrMat->num_rows, 1536 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1537 upTriFactorT->csrMat->num_entries, 1538 #endif 1539 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1540 upTriFactorT->csrMat->values->data().get(), 1541 upTriFactorT->csrMat->row_offsets->data().get(), 1542 upTriFactorT->csrMat->column_indices->data().get(), 1543 upTriFactorT->solveInfo, 1544 xarray, 1545 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1546 tempGPU->data().get(), 1547 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1548 #else 1549 tempGPU->data().get());CHKERRCUSPARSE(stat); 1550 #endif 1551 1552 /* Then, solve L */ 1553 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1554 loTriFactorT->csrMat->num_rows, 1555 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1556 loTriFactorT->csrMat->num_entries, 1557 #endif 1558 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1559 loTriFactorT->csrMat->values->data().get(), 1560 loTriFactorT->csrMat->row_offsets->data().get(), 1561 loTriFactorT->csrMat->column_indices->data().get(), 1562 loTriFactorT->solveInfo, 1563 tempGPU->data().get(), 1564 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1565 xarray, 1566 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1567 #else 1568 xarray);CHKERRCUSPARSE(stat); 1569 #endif 1570 1571 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1572 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1573 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1574 tempGPU->begin()); 1575 1576 /* Copy the temporary to the full solution. */ 1577 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1578 1579 /* restore */ 1580 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1581 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1582 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1583 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1584 PetscFunctionReturn(0); 1585 } 1586 1587 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1588 { 1589 const PetscScalar *barray; 1590 PetscScalar *xarray; 1591 cusparseStatus_t stat; 1592 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1593 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1594 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1595 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1596 PetscErrorCode ierr; 1597 1598 PetscFunctionBegin; 1599 /* Analyze the matrix and create the transpose ... on the fly */ 1600 if (!loTriFactorT && !upTriFactorT) { 1601 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1602 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1603 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1604 } 1605 1606 /* Get the GPU pointers */ 1607 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1608 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1609 1610 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1611 /* First, solve U */ 1612 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1613 upTriFactorT->csrMat->num_rows, 1614 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1615 upTriFactorT->csrMat->num_entries, 1616 #endif 1617 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1618 upTriFactorT->csrMat->values->data().get(), 1619 upTriFactorT->csrMat->row_offsets->data().get(), 1620 upTriFactorT->csrMat->column_indices->data().get(), 1621 upTriFactorT->solveInfo, 1622 barray, 1623 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1624 tempGPU->data().get(), 1625 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1626 #else 1627 tempGPU->data().get());CHKERRCUSPARSE(stat); 1628 #endif 1629 1630 /* Then, solve L */ 1631 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1632 loTriFactorT->csrMat->num_rows, 1633 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1634 loTriFactorT->csrMat->num_entries, 1635 #endif 1636 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1637 loTriFactorT->csrMat->values->data().get(), 1638 loTriFactorT->csrMat->row_offsets->data().get(), 1639 loTriFactorT->csrMat->column_indices->data().get(), 1640 loTriFactorT->solveInfo, 1641 tempGPU->data().get(), 1642 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1643 xarray, 1644 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1645 #else 1646 xarray);CHKERRCUSPARSE(stat); 1647 #endif 1648 1649 /* restore */ 1650 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1651 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1652 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1653 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1654 PetscFunctionReturn(0); 1655 } 1656 1657 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1658 { 1659 const PetscScalar *barray; 1660 PetscScalar *xarray; 1661 thrust::device_ptr<const PetscScalar> bGPU; 1662 thrust::device_ptr<PetscScalar> xGPU; 1663 cusparseStatus_t stat; 1664 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1665 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1666 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1667 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1668 PetscErrorCode ierr; 1669 1670 PetscFunctionBegin; 1671 1672 /* Get the GPU pointers */ 1673 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1674 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1675 xGPU = thrust::device_pointer_cast(xarray); 1676 bGPU = thrust::device_pointer_cast(barray); 1677 1678 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1679 /* First, reorder with the row permutation */ 1680 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1681 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1682 tempGPU->begin()); 1683 1684 /* Next, solve L */ 1685 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1686 loTriFactor->csrMat->num_rows, 1687 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1688 loTriFactor->csrMat->num_entries, 1689 #endif 1690 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1691 loTriFactor->csrMat->values->data().get(), 1692 loTriFactor->csrMat->row_offsets->data().get(), 1693 loTriFactor->csrMat->column_indices->data().get(), 1694 loTriFactor->solveInfo, 1695 tempGPU->data().get(), 1696 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1697 xarray, 1698 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1699 #else 1700 xarray);CHKERRCUSPARSE(stat); 1701 #endif 1702 1703 /* Then, solve U */ 1704 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1705 upTriFactor->csrMat->num_rows, 1706 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1707 upTriFactor->csrMat->num_entries, 1708 #endif 1709 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1710 upTriFactor->csrMat->values->data().get(), 1711 upTriFactor->csrMat->row_offsets->data().get(), 1712 upTriFactor->csrMat->column_indices->data().get(), 1713 upTriFactor->solveInfo,xarray, 1714 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1715 tempGPU->data().get(), 1716 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1717 #else 1718 tempGPU->data().get());CHKERRCUSPARSE(stat); 1719 #endif 1720 1721 /* Last, reorder with the column permutation */ 1722 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1723 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1724 xGPU); 1725 1726 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1727 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1728 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1729 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1730 PetscFunctionReturn(0); 1731 } 1732 1733 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1734 { 1735 const PetscScalar *barray; 1736 PetscScalar *xarray; 1737 cusparseStatus_t stat; 1738 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1739 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1740 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1741 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1742 PetscErrorCode ierr; 1743 1744 PetscFunctionBegin; 1745 /* Get the GPU pointers */ 1746 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1747 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1748 1749 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1750 /* First, solve L */ 1751 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1752 loTriFactor->csrMat->num_rows, 1753 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1754 loTriFactor->csrMat->num_entries, 1755 #endif 1756 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1757 loTriFactor->csrMat->values->data().get(), 1758 loTriFactor->csrMat->row_offsets->data().get(), 1759 loTriFactor->csrMat->column_indices->data().get(), 1760 loTriFactor->solveInfo, 1761 barray, 1762 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1763 tempGPU->data().get(), 1764 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1765 #else 1766 tempGPU->data().get());CHKERRCUSPARSE(stat); 1767 #endif 1768 1769 /* Next, solve U */ 1770 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1771 upTriFactor->csrMat->num_rows, 1772 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1773 upTriFactor->csrMat->num_entries, 1774 #endif 1775 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1776 upTriFactor->csrMat->values->data().get(), 1777 upTriFactor->csrMat->row_offsets->data().get(), 1778 upTriFactor->csrMat->column_indices->data().get(), 1779 upTriFactor->solveInfo, 1780 tempGPU->data().get(), 1781 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1782 xarray, 1783 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1784 #else 1785 xarray);CHKERRCUSPARSE(stat); 1786 #endif 1787 1788 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1789 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1790 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1791 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1792 PetscFunctionReturn(0); 1793 } 1794 1795 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1796 { 1797 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1798 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1799 cudaError_t cerr; 1800 PetscErrorCode ierr; 1801 1802 PetscFunctionBegin; 1803 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1804 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1805 1806 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1807 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1808 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1809 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1810 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1811 A->offloadmask = PETSC_OFFLOAD_BOTH; 1812 } 1813 PetscFunctionReturn(0); 1814 } 1815 1816 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1817 { 1818 PetscErrorCode ierr; 1819 1820 PetscFunctionBegin; 1821 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1822 *array = ((Mat_SeqAIJ*)A->data)->a; 1823 PetscFunctionReturn(0); 1824 } 1825 1826 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1827 { 1828 PetscFunctionBegin; 1829 A->offloadmask = PETSC_OFFLOAD_CPU; 1830 *array = NULL; 1831 PetscFunctionReturn(0); 1832 } 1833 1834 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1835 { 1836 PetscErrorCode ierr; 1837 1838 PetscFunctionBegin; 1839 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1840 *array = ((Mat_SeqAIJ*)A->data)->a; 1841 PetscFunctionReturn(0); 1842 } 1843 1844 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1845 { 1846 PetscFunctionBegin; 1847 *array = NULL; 1848 PetscFunctionReturn(0); 1849 } 1850 1851 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1852 { 1853 PetscFunctionBegin; 1854 *array = ((Mat_SeqAIJ*)A->data)->a; 1855 PetscFunctionReturn(0); 1856 } 1857 1858 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1859 { 1860 PetscFunctionBegin; 1861 A->offloadmask = PETSC_OFFLOAD_CPU; 1862 *array = NULL; 1863 PetscFunctionReturn(0); 1864 } 1865 1866 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1867 { 1868 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1869 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1870 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1871 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1872 PetscErrorCode ierr; 1873 cusparseStatus_t stat; 1874 PetscBool both = PETSC_TRUE; 1875 cudaError_t err; 1876 1877 PetscFunctionBegin; 1878 PetscCheckFalse(A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1879 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1880 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1881 CsrMatrix *matrix; 1882 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1883 1884 PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1885 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1886 matrix->values->assign(a->a, a->a+a->nz); 1887 err = WaitForCUDA();CHKERRCUDA(err); 1888 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1889 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1890 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1891 } else { 1892 PetscInt nnz; 1893 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1894 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1895 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1896 delete cusparsestruct->workVector; 1897 delete cusparsestruct->rowoffsets_gpu; 1898 cusparsestruct->workVector = NULL; 1899 cusparsestruct->rowoffsets_gpu = NULL; 1900 try { 1901 if (a->compressedrow.use) { 1902 m = a->compressedrow.nrows; 1903 ii = a->compressedrow.i; 1904 ridx = a->compressedrow.rindex; 1905 } else { 1906 m = A->rmap->n; 1907 ii = a->i; 1908 ridx = NULL; 1909 } 1910 PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1911 PetscCheckFalse(m && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1912 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1913 else nnz = a->nz; 1914 1915 /* create cusparse matrix */ 1916 cusparsestruct->nrows = m; 1917 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1918 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1919 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1920 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1921 1922 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1923 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1924 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1925 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1926 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1927 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1928 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1929 1930 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1931 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1932 /* set the matrix */ 1933 CsrMatrix *mat= new CsrMatrix; 1934 mat->num_rows = m; 1935 mat->num_cols = A->cmap->n; 1936 mat->num_entries = nnz; 1937 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1938 mat->row_offsets->assign(ii, ii + m+1); 1939 1940 mat->column_indices = new THRUSTINTARRAY32(nnz); 1941 mat->column_indices->assign(a->j, a->j+nnz); 1942 1943 mat->values = new THRUSTARRAY(nnz); 1944 if (a->a) mat->values->assign(a->a, a->a+nnz); 1945 1946 /* assign the pointer */ 1947 matstruct->mat = mat; 1948 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1949 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1950 stat = cusparseCreateCsr(&matstruct->matDescr, 1951 mat->num_rows, mat->num_cols, mat->num_entries, 1952 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1953 mat->values->data().get(), 1954 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1955 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1956 } 1957 #endif 1958 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1959 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1960 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1961 #else 1962 CsrMatrix *mat= new CsrMatrix; 1963 mat->num_rows = m; 1964 mat->num_cols = A->cmap->n; 1965 mat->num_entries = nnz; 1966 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1967 mat->row_offsets->assign(ii, ii + m+1); 1968 1969 mat->column_indices = new THRUSTINTARRAY32(nnz); 1970 mat->column_indices->assign(a->j, a->j+nnz); 1971 1972 mat->values = new THRUSTARRAY(nnz); 1973 if (a->a) mat->values->assign(a->a, a->a+nnz); 1974 1975 cusparseHybMat_t hybMat; 1976 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1977 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1978 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1979 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1980 matstruct->descr, mat->values->data().get(), 1981 mat->row_offsets->data().get(), 1982 mat->column_indices->data().get(), 1983 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1984 /* assign the pointer */ 1985 matstruct->mat = hybMat; 1986 1987 if (mat) { 1988 if (mat->values) delete (THRUSTARRAY*)mat->values; 1989 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1990 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1991 delete (CsrMatrix*)mat; 1992 } 1993 #endif 1994 } 1995 1996 /* assign the compressed row indices */ 1997 if (a->compressedrow.use) { 1998 cusparsestruct->workVector = new THRUSTARRAY(m); 1999 matstruct->cprowIndices = new THRUSTINTARRAY(m); 2000 matstruct->cprowIndices->assign(ridx,ridx+m); 2001 tmp = m; 2002 } else { 2003 cusparsestruct->workVector = NULL; 2004 matstruct->cprowIndices = NULL; 2005 tmp = 0; 2006 } 2007 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 2008 2009 /* assign the pointer */ 2010 cusparsestruct->mat = matstruct; 2011 } catch(char *ex) { 2012 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2013 } 2014 err = WaitForCUDA();CHKERRCUDA(err); 2015 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 2016 cusparsestruct->nonzerostate = A->nonzerostate; 2017 } 2018 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2019 } 2020 PetscFunctionReturn(0); 2021 } 2022 2023 struct VecCUDAPlusEquals 2024 { 2025 template <typename Tuple> 2026 __host__ __device__ 2027 void operator()(Tuple t) 2028 { 2029 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2030 } 2031 }; 2032 2033 struct VecCUDAEquals 2034 { 2035 template <typename Tuple> 2036 __host__ __device__ 2037 void operator()(Tuple t) 2038 { 2039 thrust::get<1>(t) = thrust::get<0>(t); 2040 } 2041 }; 2042 2043 struct VecCUDAEqualsReverse 2044 { 2045 template <typename Tuple> 2046 __host__ __device__ 2047 void operator()(Tuple t) 2048 { 2049 thrust::get<0>(t) = thrust::get<1>(t); 2050 } 2051 }; 2052 2053 struct MatMatCusparse { 2054 PetscBool cisdense; 2055 PetscScalar *Bt; 2056 Mat X; 2057 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2058 PetscLogDouble flops; 2059 CsrMatrix *Bcsr; 2060 2061 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2062 cusparseSpMatDescr_t matSpBDescr; 2063 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2064 cusparseDnMatDescr_t matBDescr; 2065 cusparseDnMatDescr_t matCDescr; 2066 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2067 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2068 void *dBuffer4; 2069 void *dBuffer5; 2070 #endif 2071 size_t mmBufferSize; 2072 void *mmBuffer; 2073 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2074 cusparseSpGEMMDescr_t spgemmDesc; 2075 #endif 2076 }; 2077 2078 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2079 { 2080 PetscErrorCode ierr; 2081 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2082 cudaError_t cerr; 2083 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2084 cusparseStatus_t stat; 2085 #endif 2086 2087 PetscFunctionBegin; 2088 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 2089 delete mmdata->Bcsr; 2090 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2091 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 2092 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 2093 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 2094 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 2095 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2096 if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 2097 if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2098 #endif 2099 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2100 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2101 #endif 2102 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2103 ierr = PetscFree(data);CHKERRQ(ierr); 2104 PetscFunctionReturn(0); 2105 } 2106 2107 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2108 2109 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2110 { 2111 Mat_Product *product = C->product; 2112 Mat A,B; 2113 PetscInt m,n,blda,clda; 2114 PetscBool flg,biscuda; 2115 Mat_SeqAIJCUSPARSE *cusp; 2116 cusparseStatus_t stat; 2117 cusparseOperation_t opA; 2118 const PetscScalar *barray; 2119 PetscScalar *carray; 2120 PetscErrorCode ierr; 2121 MatMatCusparse *mmdata; 2122 Mat_SeqAIJCUSPARSEMultStruct *mat; 2123 CsrMatrix *csrmat; 2124 2125 PetscFunctionBegin; 2126 MatCheckProduct(C,1); 2127 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2128 mmdata = (MatMatCusparse*)product->data; 2129 A = product->A; 2130 B = product->B; 2131 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2132 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2133 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2134 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2135 PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2136 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2137 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2138 switch (product->type) { 2139 case MATPRODUCT_AB: 2140 case MATPRODUCT_PtAP: 2141 mat = cusp->mat; 2142 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2143 m = A->rmap->n; 2144 n = B->cmap->n; 2145 break; 2146 case MATPRODUCT_AtB: 2147 if (!A->form_explicit_transpose) { 2148 mat = cusp->mat; 2149 opA = CUSPARSE_OPERATION_TRANSPOSE; 2150 } else { 2151 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2152 mat = cusp->matTranspose; 2153 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2154 } 2155 m = A->cmap->n; 2156 n = B->cmap->n; 2157 break; 2158 case MATPRODUCT_ABt: 2159 case MATPRODUCT_RARt: 2160 mat = cusp->mat; 2161 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2162 m = A->rmap->n; 2163 n = B->rmap->n; 2164 break; 2165 default: 2166 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2167 } 2168 PetscCheckFalse(!mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2169 csrmat = (CsrMatrix*)mat->mat; 2170 /* if the user passed a CPU matrix, copy the data to the GPU */ 2171 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2172 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2173 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2174 2175 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2176 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2177 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2178 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2179 } else { 2180 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2181 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2182 } 2183 2184 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2185 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2186 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2187 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2188 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2189 size_t mmBufferSize; 2190 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2191 if (!mmdata->matBDescr) { 2192 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2193 mmdata->Blda = blda; 2194 } 2195 2196 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2197 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2198 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2199 mmdata->Clda = clda; 2200 } 2201 2202 if (!mat->matDescr) { 2203 stat = cusparseCreateCsr(&mat->matDescr, 2204 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2205 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2206 csrmat->values->data().get(), 2207 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2208 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2209 } 2210 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2211 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2212 mmdata->matCDescr,cusparse_scalartype, 2213 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2214 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2215 cudaError_t cerr; 2216 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2217 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2218 mmdata->mmBufferSize = mmBufferSize; 2219 } 2220 mmdata->initialized = PETSC_TRUE; 2221 } else { 2222 /* to be safe, always update pointers of the mats */ 2223 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2224 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2225 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2226 } 2227 2228 /* do cusparseSpMM, which supports transpose on B */ 2229 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2230 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2231 mmdata->matCDescr,cusparse_scalartype, 2232 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2233 #else 2234 PetscInt k; 2235 /* cusparseXcsrmm does not support transpose on B */ 2236 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2237 cublasHandle_t cublasv2handle; 2238 cublasStatus_t cerr; 2239 2240 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2241 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2242 B->cmap->n,B->rmap->n, 2243 &PETSC_CUSPARSE_ONE ,barray,blda, 2244 &PETSC_CUSPARSE_ZERO,barray,blda, 2245 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2246 blda = B->cmap->n; 2247 k = B->cmap->n; 2248 } else { 2249 k = B->rmap->n; 2250 } 2251 2252 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2253 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2254 csrmat->num_entries,mat->alpha_one,mat->descr, 2255 csrmat->values->data().get(), 2256 csrmat->row_offsets->data().get(), 2257 csrmat->column_indices->data().get(), 2258 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2259 carray,clda);CHKERRCUSPARSE(stat); 2260 #endif 2261 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2262 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2263 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2264 if (product->type == MATPRODUCT_RARt) { 2265 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2266 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2267 } else if (product->type == MATPRODUCT_PtAP) { 2268 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2269 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2270 } else { 2271 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2272 } 2273 if (mmdata->cisdense) { 2274 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2275 } 2276 if (!biscuda) { 2277 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2278 } 2279 PetscFunctionReturn(0); 2280 } 2281 2282 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2283 { 2284 Mat_Product *product = C->product; 2285 Mat A,B; 2286 PetscInt m,n; 2287 PetscBool cisdense,flg; 2288 PetscErrorCode ierr; 2289 MatMatCusparse *mmdata; 2290 Mat_SeqAIJCUSPARSE *cusp; 2291 2292 PetscFunctionBegin; 2293 MatCheckProduct(C,1); 2294 PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2295 A = product->A; 2296 B = product->B; 2297 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2298 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2299 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2300 PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2301 switch (product->type) { 2302 case MATPRODUCT_AB: 2303 m = A->rmap->n; 2304 n = B->cmap->n; 2305 break; 2306 case MATPRODUCT_AtB: 2307 m = A->cmap->n; 2308 n = B->cmap->n; 2309 break; 2310 case MATPRODUCT_ABt: 2311 m = A->rmap->n; 2312 n = B->rmap->n; 2313 break; 2314 case MATPRODUCT_PtAP: 2315 m = B->cmap->n; 2316 n = B->cmap->n; 2317 break; 2318 case MATPRODUCT_RARt: 2319 m = B->rmap->n; 2320 n = B->rmap->n; 2321 break; 2322 default: 2323 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2324 } 2325 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2326 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2327 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2328 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2329 2330 /* product data */ 2331 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2332 mmdata->cisdense = cisdense; 2333 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2334 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2335 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2336 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2337 } 2338 #endif 2339 /* for these products we need intermediate storage */ 2340 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2341 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2342 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2343 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2344 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2345 } else { 2346 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2347 } 2348 } 2349 C->product->data = mmdata; 2350 C->product->destroy = MatDestroy_MatMatCusparse; 2351 2352 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2353 PetscFunctionReturn(0); 2354 } 2355 2356 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2357 { 2358 Mat_Product *product = C->product; 2359 Mat A,B; 2360 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2361 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2362 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2363 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2364 PetscBool flg; 2365 PetscErrorCode ierr; 2366 cusparseStatus_t stat; 2367 cudaError_t cerr; 2368 MatProductType ptype; 2369 MatMatCusparse *mmdata; 2370 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2371 cusparseSpMatDescr_t BmatSpDescr; 2372 #endif 2373 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2374 2375 PetscFunctionBegin; 2376 MatCheckProduct(C,1); 2377 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2378 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2379 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2380 mmdata = (MatMatCusparse*)C->product->data; 2381 A = product->A; 2382 B = product->B; 2383 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2384 mmdata->reusesym = PETSC_FALSE; 2385 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2386 PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2387 Cmat = Ccusp->mat; 2388 PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2389 Ccsr = (CsrMatrix*)Cmat->mat; 2390 PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2391 goto finalize; 2392 } 2393 if (!c->nz) goto finalize; 2394 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2395 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2396 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2397 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2398 PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2399 PetscCheckFalse(B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2400 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2401 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2402 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2403 PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2404 PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2405 PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2406 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2407 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2408 2409 ptype = product->type; 2410 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2411 ptype = MATPRODUCT_AB; 2412 PetscCheckFalse(!product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2413 } 2414 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2415 ptype = MATPRODUCT_AB; 2416 PetscCheckFalse(!product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2417 } 2418 switch (ptype) { 2419 case MATPRODUCT_AB: 2420 Amat = Acusp->mat; 2421 Bmat = Bcusp->mat; 2422 break; 2423 case MATPRODUCT_AtB: 2424 Amat = Acusp->matTranspose; 2425 Bmat = Bcusp->mat; 2426 break; 2427 case MATPRODUCT_ABt: 2428 Amat = Acusp->mat; 2429 Bmat = Bcusp->matTranspose; 2430 break; 2431 default: 2432 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2433 } 2434 Cmat = Ccusp->mat; 2435 PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2436 PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2437 PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2438 Acsr = (CsrMatrix*)Amat->mat; 2439 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2440 Ccsr = (CsrMatrix*)Cmat->mat; 2441 PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2442 PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2443 PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2444 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2445 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2446 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2447 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2448 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2449 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2450 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2451 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2452 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2453 #else 2454 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2455 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2456 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2457 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2458 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2459 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2460 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2461 #endif 2462 #else 2463 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2464 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2465 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2466 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2467 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2468 #endif 2469 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2470 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2471 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2472 C->offloadmask = PETSC_OFFLOAD_GPU; 2473 finalize: 2474 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2475 ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2476 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2477 ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 2478 c->reallocs = 0; 2479 C->info.mallocs += 0; 2480 C->info.nz_unneeded = 0; 2481 C->assembled = C->was_assembled = PETSC_TRUE; 2482 C->num_ass++; 2483 PetscFunctionReturn(0); 2484 } 2485 2486 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2487 { 2488 Mat_Product *product = C->product; 2489 Mat A,B; 2490 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2491 Mat_SeqAIJ *a,*b,*c; 2492 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2493 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2494 PetscInt i,j,m,n,k; 2495 PetscBool flg; 2496 PetscErrorCode ierr; 2497 cusparseStatus_t stat; 2498 cudaError_t cerr; 2499 MatProductType ptype; 2500 MatMatCusparse *mmdata; 2501 PetscLogDouble flops; 2502 PetscBool biscompressed,ciscompressed; 2503 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2504 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2505 cusparseSpMatDescr_t BmatSpDescr; 2506 #else 2507 int cnz; 2508 #endif 2509 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2510 2511 PetscFunctionBegin; 2512 MatCheckProduct(C,1); 2513 PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2514 A = product->A; 2515 B = product->B; 2516 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2517 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2518 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2519 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2520 a = (Mat_SeqAIJ*)A->data; 2521 b = (Mat_SeqAIJ*)B->data; 2522 /* product data */ 2523 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2524 C->product->data = mmdata; 2525 C->product->destroy = MatDestroy_MatMatCusparse; 2526 2527 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2528 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2529 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2530 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2531 PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2532 PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2533 2534 ptype = product->type; 2535 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2536 ptype = MATPRODUCT_AB; 2537 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2538 } 2539 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2540 ptype = MATPRODUCT_AB; 2541 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2542 } 2543 biscompressed = PETSC_FALSE; 2544 ciscompressed = PETSC_FALSE; 2545 switch (ptype) { 2546 case MATPRODUCT_AB: 2547 m = A->rmap->n; 2548 n = B->cmap->n; 2549 k = A->cmap->n; 2550 Amat = Acusp->mat; 2551 Bmat = Bcusp->mat; 2552 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2553 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2554 break; 2555 case MATPRODUCT_AtB: 2556 m = A->cmap->n; 2557 n = B->cmap->n; 2558 k = A->rmap->n; 2559 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2560 Amat = Acusp->matTranspose; 2561 Bmat = Bcusp->mat; 2562 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2563 break; 2564 case MATPRODUCT_ABt: 2565 m = A->rmap->n; 2566 n = B->rmap->n; 2567 k = A->cmap->n; 2568 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2569 Amat = Acusp->mat; 2570 Bmat = Bcusp->matTranspose; 2571 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2572 break; 2573 default: 2574 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2575 } 2576 2577 /* create cusparse matrix */ 2578 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2579 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2580 c = (Mat_SeqAIJ*)C->data; 2581 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2582 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2583 Ccsr = new CsrMatrix; 2584 2585 c->compressedrow.use = ciscompressed; 2586 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2587 c->compressedrow.nrows = a->compressedrow.nrows; 2588 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2589 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2590 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2591 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2592 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2593 } else { 2594 c->compressedrow.nrows = 0; 2595 c->compressedrow.i = NULL; 2596 c->compressedrow.rindex = NULL; 2597 Ccusp->workVector = NULL; 2598 Cmat->cprowIndices = NULL; 2599 } 2600 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2601 Ccusp->mat = Cmat; 2602 Ccusp->mat->mat = Ccsr; 2603 Ccsr->num_rows = Ccusp->nrows; 2604 Ccsr->num_cols = n; 2605 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2606 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2607 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2608 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2609 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2610 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2611 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2612 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2613 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2614 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2615 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2616 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2617 c->nz = 0; 2618 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2619 Ccsr->values = new THRUSTARRAY(c->nz); 2620 goto finalizesym; 2621 } 2622 2623 PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2624 PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2625 Acsr = (CsrMatrix*)Amat->mat; 2626 if (!biscompressed) { 2627 Bcsr = (CsrMatrix*)Bmat->mat; 2628 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2629 BmatSpDescr = Bmat->matDescr; 2630 #endif 2631 } else { /* we need to use row offsets for the full matrix */ 2632 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2633 Bcsr = new CsrMatrix; 2634 Bcsr->num_rows = B->rmap->n; 2635 Bcsr->num_cols = cBcsr->num_cols; 2636 Bcsr->num_entries = cBcsr->num_entries; 2637 Bcsr->column_indices = cBcsr->column_indices; 2638 Bcsr->values = cBcsr->values; 2639 if (!Bcusp->rowoffsets_gpu) { 2640 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2641 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2642 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2643 } 2644 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2645 mmdata->Bcsr = Bcsr; 2646 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2647 if (Bcsr->num_rows && Bcsr->num_cols) { 2648 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2649 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2650 Bcsr->values->data().get(), 2651 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2652 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2653 } 2654 BmatSpDescr = mmdata->matSpBDescr; 2655 #endif 2656 } 2657 PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2658 PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2659 /* precompute flops count */ 2660 if (ptype == MATPRODUCT_AB) { 2661 for (i=0, flops = 0; i<A->rmap->n; i++) { 2662 const PetscInt st = a->i[i]; 2663 const PetscInt en = a->i[i+1]; 2664 for (j=st; j<en; j++) { 2665 const PetscInt brow = a->j[j]; 2666 flops += 2.*(b->i[brow+1] - b->i[brow]); 2667 } 2668 } 2669 } else if (ptype == MATPRODUCT_AtB) { 2670 for (i=0, flops = 0; i<A->rmap->n; i++) { 2671 const PetscInt anzi = a->i[i+1] - a->i[i]; 2672 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2673 flops += (2.*anzi)*bnzi; 2674 } 2675 } else { /* TODO */ 2676 flops = 0.; 2677 } 2678 2679 mmdata->flops = flops; 2680 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2681 2682 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2683 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2684 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2685 NULL, NULL, NULL, 2686 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2687 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2688 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2689 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2690 { 2691 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2692 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2693 */ 2694 void* dBuffer1 = NULL; 2695 void* dBuffer2 = NULL; 2696 void* dBuffer3 = NULL; 2697 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2698 size_t bufferSize1 = 0; 2699 size_t bufferSize2 = 0; 2700 size_t bufferSize3 = 0; 2701 size_t bufferSize4 = 0; 2702 size_t bufferSize5 = 0; 2703 2704 /*----------------------------------------------------------------------*/ 2705 /* ask bufferSize1 bytes for external memory */ 2706 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2707 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2708 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2709 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2710 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2711 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2712 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2713 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2714 2715 /*----------------------------------------------------------------------*/ 2716 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2717 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2718 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2719 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2720 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2721 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2722 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2723 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2724 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2725 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2726 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2727 2728 /*----------------------------------------------------------------------*/ 2729 /* get matrix C non-zero entries C_nnz1 */ 2730 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2731 c->nz = (PetscInt) C_nnz1; 2732 /* allocate matrix C */ 2733 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2734 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2735 /* update matC with the new pointers */ 2736 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2737 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2738 2739 /*----------------------------------------------------------------------*/ 2740 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2741 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2742 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2743 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2744 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2745 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2746 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2747 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2748 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2749 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2750 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2751 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2752 ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2753 } 2754 #else 2755 size_t bufSize2; 2756 /* ask bufferSize bytes for external memory */ 2757 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2758 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2759 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2760 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2761 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2762 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2763 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2764 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2765 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2766 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2767 /* ask bufferSize again bytes for external memory */ 2768 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2769 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2770 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2771 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2772 /* The CUSPARSE documentation is not clear, nor the API 2773 We need both buffers to perform the operations properly! 2774 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2775 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2776 is stored in the descriptor! What a messy API... */ 2777 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2778 /* compute the intermediate product of A * B */ 2779 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2780 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2781 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2782 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2783 /* get matrix C non-zero entries C_nnz1 */ 2784 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2785 c->nz = (PetscInt) C_nnz1; 2786 ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2787 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2788 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2789 Ccsr->values = new THRUSTARRAY(c->nz); 2790 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2791 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2792 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2793 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2794 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2795 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2796 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2797 #else 2798 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2799 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2800 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2801 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2802 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2803 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2804 c->nz = cnz; 2805 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2806 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2807 Ccsr->values = new THRUSTARRAY(c->nz); 2808 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2809 2810 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2811 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2812 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2813 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2814 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2815 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2816 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2817 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2818 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2819 #endif 2820 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2821 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2822 finalizesym: 2823 c->singlemalloc = PETSC_FALSE; 2824 c->free_a = PETSC_TRUE; 2825 c->free_ij = PETSC_TRUE; 2826 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2827 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2828 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2829 PetscInt *d_i = c->i; 2830 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2831 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2832 ii = *Ccsr->row_offsets; 2833 jj = *Ccsr->column_indices; 2834 if (ciscompressed) d_i = c->compressedrow.i; 2835 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2836 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2837 } else { 2838 PetscInt *d_i = c->i; 2839 if (ciscompressed) d_i = c->compressedrow.i; 2840 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2841 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2842 } 2843 if (ciscompressed) { /* need to expand host row offsets */ 2844 PetscInt r = 0; 2845 c->i[0] = 0; 2846 for (k = 0; k < c->compressedrow.nrows; k++) { 2847 const PetscInt next = c->compressedrow.rindex[k]; 2848 const PetscInt old = c->compressedrow.i[k]; 2849 for (; r < next; r++) c->i[r+1] = old; 2850 } 2851 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2852 } 2853 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2854 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2855 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2856 c->maxnz = c->nz; 2857 c->nonzerorowcnt = 0; 2858 c->rmax = 0; 2859 for (k = 0; k < m; k++) { 2860 const PetscInt nn = c->i[k+1] - c->i[k]; 2861 c->ilen[k] = c->imax[k] = nn; 2862 c->nonzerorowcnt += (PetscInt)!!nn; 2863 c->rmax = PetscMax(c->rmax,nn); 2864 } 2865 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2866 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2867 Ccsr->num_entries = c->nz; 2868 2869 C->nonzerostate++; 2870 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2871 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2872 Ccusp->nonzerostate = C->nonzerostate; 2873 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2874 C->preallocated = PETSC_TRUE; 2875 C->assembled = PETSC_FALSE; 2876 C->was_assembled = PETSC_FALSE; 2877 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2878 mmdata->reusesym = PETSC_TRUE; 2879 C->offloadmask = PETSC_OFFLOAD_GPU; 2880 } 2881 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2882 PetscFunctionReturn(0); 2883 } 2884 2885 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2886 2887 /* handles sparse or dense B */ 2888 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2889 { 2890 Mat_Product *product = mat->product; 2891 PetscErrorCode ierr; 2892 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2893 2894 PetscFunctionBegin; 2895 MatCheckProduct(mat,1); 2896 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2897 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2898 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2899 } 2900 if (product->type == MATPRODUCT_ABC) { 2901 Ciscusp = PETSC_FALSE; 2902 if (!product->C->boundtocpu) { 2903 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2904 } 2905 } 2906 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 2907 PetscBool usecpu = PETSC_FALSE; 2908 switch (product->type) { 2909 case MATPRODUCT_AB: 2910 if (product->api_user) { 2911 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 2912 ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2913 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2914 } else { 2915 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 2916 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2917 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2918 } 2919 break; 2920 case MATPRODUCT_AtB: 2921 if (product->api_user) { 2922 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 2923 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2924 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2925 } else { 2926 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 2927 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2928 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2929 } 2930 break; 2931 case MATPRODUCT_PtAP: 2932 if (product->api_user) { 2933 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 2934 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2935 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2936 } else { 2937 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 2938 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2939 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2940 } 2941 break; 2942 case MATPRODUCT_RARt: 2943 if (product->api_user) { 2944 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 2945 ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2946 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2947 } else { 2948 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 2949 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2950 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2951 } 2952 break; 2953 case MATPRODUCT_ABC: 2954 if (product->api_user) { 2955 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 2956 ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2957 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2958 } else { 2959 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 2960 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2961 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2962 } 2963 break; 2964 default: 2965 break; 2966 } 2967 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2968 } 2969 /* dispatch */ 2970 if (isdense) { 2971 switch (product->type) { 2972 case MATPRODUCT_AB: 2973 case MATPRODUCT_AtB: 2974 case MATPRODUCT_ABt: 2975 case MATPRODUCT_PtAP: 2976 case MATPRODUCT_RARt: 2977 if (product->A->boundtocpu) { 2978 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2979 } else { 2980 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2981 } 2982 break; 2983 case MATPRODUCT_ABC: 2984 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2985 break; 2986 default: 2987 break; 2988 } 2989 } else if (Biscusp && Ciscusp) { 2990 switch (product->type) { 2991 case MATPRODUCT_AB: 2992 case MATPRODUCT_AtB: 2993 case MATPRODUCT_ABt: 2994 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2995 break; 2996 case MATPRODUCT_PtAP: 2997 case MATPRODUCT_RARt: 2998 case MATPRODUCT_ABC: 2999 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3000 break; 3001 default: 3002 break; 3003 } 3004 } else { /* fallback for AIJ */ 3005 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 3006 } 3007 PetscFunctionReturn(0); 3008 } 3009 3010 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3011 { 3012 PetscErrorCode ierr; 3013 3014 PetscFunctionBegin; 3015 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3016 PetscFunctionReturn(0); 3017 } 3018 3019 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 3020 { 3021 PetscErrorCode ierr; 3022 3023 PetscFunctionBegin; 3024 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3025 PetscFunctionReturn(0); 3026 } 3027 3028 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3029 { 3030 PetscErrorCode ierr; 3031 3032 PetscFunctionBegin; 3033 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 3034 PetscFunctionReturn(0); 3035 } 3036 3037 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3038 { 3039 PetscErrorCode ierr; 3040 3041 PetscFunctionBegin; 3042 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 3043 PetscFunctionReturn(0); 3044 } 3045 3046 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3047 { 3048 PetscErrorCode ierr; 3049 3050 PetscFunctionBegin; 3051 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3052 PetscFunctionReturn(0); 3053 } 3054 3055 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 3056 { 3057 int i = blockIdx.x*blockDim.x + threadIdx.x; 3058 if (i < n) y[idx[i]] += x[i]; 3059 } 3060 3061 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3062 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 3063 { 3064 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3065 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 3066 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3067 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3068 PetscErrorCode ierr; 3069 cusparseStatus_t stat; 3070 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3071 PetscBool compressed; 3072 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3073 PetscInt nx,ny; 3074 #endif 3075 3076 PetscFunctionBegin; 3077 PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3078 if (!a->nonzerorowcnt) { 3079 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 3080 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 3081 PetscFunctionReturn(0); 3082 } 3083 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3084 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3085 if (!trans) { 3086 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3087 PetscCheckFalse(!matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3088 } else { 3089 if (herm || !A->form_explicit_transpose) { 3090 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3091 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3092 } else { 3093 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);} 3094 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3095 } 3096 } 3097 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3098 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3099 3100 try { 3101 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3102 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 3103 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 3104 3105 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3106 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3107 /* z = A x + beta y. 3108 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3109 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3110 */ 3111 xptr = xarray; 3112 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3113 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3114 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3115 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3116 allocated to accommodate different uses. So we get the length info directly from mat. 3117 */ 3118 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3119 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3120 nx = mat->num_cols; 3121 ny = mat->num_rows; 3122 } 3123 #endif 3124 } else { 3125 /* z = A^T x + beta y 3126 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3127 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3128 */ 3129 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3130 dptr = zarray; 3131 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3132 if (compressed) { /* Scatter x to work vector */ 3133 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3134 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3135 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3136 VecCUDAEqualsReverse()); 3137 } 3138 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3139 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3140 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3141 nx = mat->num_rows; 3142 ny = mat->num_cols; 3143 } 3144 #endif 3145 } 3146 3147 /* csr_spmv does y = alpha op(A) x + beta y */ 3148 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3149 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3150 PetscCheckFalse(opA < 0 || opA > 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3151 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3152 cudaError_t cerr; 3153 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3154 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3155 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3156 matstruct->matDescr, 3157 matstruct->cuSpMV[opA].vecXDescr, beta, 3158 matstruct->cuSpMV[opA].vecYDescr, 3159 cusparse_scalartype, 3160 cusparsestruct->spmvAlg, 3161 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3162 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3163 3164 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3165 } else { 3166 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3167 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3168 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3169 } 3170 3171 stat = cusparseSpMV(cusparsestruct->handle, opA, 3172 matstruct->alpha_one, 3173 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3174 matstruct->cuSpMV[opA].vecXDescr, 3175 beta, 3176 matstruct->cuSpMV[opA].vecYDescr, 3177 cusparse_scalartype, 3178 cusparsestruct->spmvAlg, 3179 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3180 #else 3181 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3182 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3183 mat->num_rows, mat->num_cols, 3184 mat->num_entries, matstruct->alpha_one, matstruct->descr, 3185 mat->values->data().get(), mat->row_offsets->data().get(), 3186 mat->column_indices->data().get(), xptr, beta, 3187 dptr);CHKERRCUSPARSE(stat); 3188 #endif 3189 } else { 3190 if (cusparsestruct->nrows) { 3191 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3192 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3193 #else 3194 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3195 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3196 matstruct->alpha_one, matstruct->descr, hybMat, 3197 xptr, beta, 3198 dptr);CHKERRCUSPARSE(stat); 3199 #endif 3200 } 3201 } 3202 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3203 3204 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3205 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3206 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3207 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3208 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3209 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3210 } 3211 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3212 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 3213 } 3214 3215 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3216 if (compressed) { 3217 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3218 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3219 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3220 prevent that. So I just add a ScatterAdd kernel. 3221 */ 3222 #if 0 3223 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3224 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3225 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3226 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3227 VecCUDAPlusEquals()); 3228 #else 3229 PetscInt n = matstruct->cprowIndices->size(); 3230 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3231 #endif 3232 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3233 } 3234 } else { 3235 if (yy && yy != zz) { 3236 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3237 } 3238 } 3239 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3240 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3241 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 3242 } catch(char *ex) { 3243 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3244 } 3245 if (yy) { 3246 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3247 } else { 3248 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3249 } 3250 PetscFunctionReturn(0); 3251 } 3252 3253 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3254 { 3255 PetscErrorCode ierr; 3256 3257 PetscFunctionBegin; 3258 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3259 PetscFunctionReturn(0); 3260 } 3261 3262 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 3263 { 3264 PetscErrorCode ierr; 3265 PetscObjectState onnz = A->nonzerostate; 3266 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3267 3268 PetscFunctionBegin; 3269 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3270 if (onnz != A->nonzerostate && cusp->deviceMat) { 3271 cudaError_t cerr; 3272 3273 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3274 cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3275 cusp->deviceMat = NULL; 3276 } 3277 PetscFunctionReturn(0); 3278 } 3279 3280 /* --------------------------------------------------------------------------------*/ 3281 /*@ 3282 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3283 (the default parallel PETSc format). This matrix will ultimately pushed down 3284 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3285 assembly performance the user should preallocate the matrix storage by setting 3286 the parameter nz (or the array nnz). By setting these parameters accurately, 3287 performance during matrix assembly can be increased by more than a factor of 50. 3288 3289 Collective 3290 3291 Input Parameters: 3292 + comm - MPI communicator, set to PETSC_COMM_SELF 3293 . m - number of rows 3294 . n - number of columns 3295 . nz - number of nonzeros per row (same for all rows) 3296 - nnz - array containing the number of nonzeros in the various rows 3297 (possibly different for each row) or NULL 3298 3299 Output Parameter: 3300 . A - the matrix 3301 3302 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3303 MatXXXXSetPreallocation() paradgm instead of this routine directly. 3304 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3305 3306 Notes: 3307 If nnz is given then nz is ignored 3308 3309 The AIJ format (also called the Yale sparse matrix format or 3310 compressed row storage), is fully compatible with standard Fortran 77 3311 storage. That is, the stored row and column indices can begin at 3312 either one (as in Fortran) or zero. See the users' manual for details. 3313 3314 Specify the preallocated storage with either nz or nnz (not both). 3315 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3316 allocation. For large problems you MUST preallocate memory or you 3317 will get TERRIBLE performance, see the users' manual chapter on matrices. 3318 3319 By default, this format uses inodes (identical nodes) when possible, to 3320 improve numerical efficiency of matrix-vector products and solves. We 3321 search for consecutive rows with the same nonzero structure, thereby 3322 reusing matrix information to achieve increased efficiency. 3323 3324 Level: intermediate 3325 3326 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3327 @*/ 3328 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3329 { 3330 PetscErrorCode ierr; 3331 3332 PetscFunctionBegin; 3333 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3334 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3335 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3336 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3337 PetscFunctionReturn(0); 3338 } 3339 3340 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3341 { 3342 PetscErrorCode ierr; 3343 3344 PetscFunctionBegin; 3345 if (A->factortype == MAT_FACTOR_NONE) { 3346 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3347 } else { 3348 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3349 } 3350 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3351 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3352 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr); 3353 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3354 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3355 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3356 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3357 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3358 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3359 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 3360 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3361 PetscFunctionReturn(0); 3362 } 3363 3364 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3365 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3366 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3367 { 3368 PetscErrorCode ierr; 3369 3370 PetscFunctionBegin; 3371 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3372 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3373 PetscFunctionReturn(0); 3374 } 3375 3376 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3377 { 3378 PetscErrorCode ierr; 3379 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3380 Mat_SeqAIJCUSPARSE *cy; 3381 Mat_SeqAIJCUSPARSE *cx; 3382 PetscScalar *ay; 3383 const PetscScalar *ax; 3384 CsrMatrix *csry,*csrx; 3385 3386 PetscFunctionBegin; 3387 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3388 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3389 if (X->ops->axpy != Y->ops->axpy) { 3390 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3391 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3392 PetscFunctionReturn(0); 3393 } 3394 /* if we are here, it means both matrices are bound to GPU */ 3395 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3396 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3397 PetscCheckFalse(cy->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3398 PetscCheckFalse(cx->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3399 csry = (CsrMatrix*)cy->mat->mat; 3400 csrx = (CsrMatrix*)cx->mat->mat; 3401 /* see if we can turn this into a cublas axpy */ 3402 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3403 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3404 if (eq) { 3405 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3406 } 3407 if (eq) str = SAME_NONZERO_PATTERN; 3408 } 3409 /* spgeam is buggy with one column */ 3410 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3411 3412 if (str == SUBSET_NONZERO_PATTERN) { 3413 cusparseStatus_t stat; 3414 PetscScalar b = 1.0; 3415 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3416 size_t bufferSize; 3417 void *buffer; 3418 cudaError_t cerr; 3419 #endif 3420 3421 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3422 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3423 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3424 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3425 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3426 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3427 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3428 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3429 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3430 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3431 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3432 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3433 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3434 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3435 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3436 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3437 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3438 #else 3439 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3440 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3441 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3442 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3443 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3444 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3445 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3446 #endif 3447 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3448 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3449 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3450 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3451 } else if (str == SAME_NONZERO_PATTERN) { 3452 cublasHandle_t cublasv2handle; 3453 cublasStatus_t berr; 3454 PetscBLASInt one = 1, bnz = 1; 3455 3456 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3457 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3458 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3459 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3460 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3461 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3462 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3463 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3464 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3465 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3466 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3467 } else { 3468 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3469 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3470 } 3471 PetscFunctionReturn(0); 3472 } 3473 3474 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3475 { 3476 PetscErrorCode ierr; 3477 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3478 PetscScalar *ay; 3479 cublasHandle_t cublasv2handle; 3480 cublasStatus_t berr; 3481 PetscBLASInt one = 1, bnz = 1; 3482 3483 PetscFunctionBegin; 3484 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3485 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3486 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3487 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3488 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3489 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3490 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3491 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3492 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3493 PetscFunctionReturn(0); 3494 } 3495 3496 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3497 { 3498 PetscErrorCode ierr; 3499 PetscBool both = PETSC_FALSE; 3500 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3501 3502 PetscFunctionBegin; 3503 if (A->factortype == MAT_FACTOR_NONE) { 3504 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3505 if (spptr->mat) { 3506 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3507 if (matrix->values) { 3508 both = PETSC_TRUE; 3509 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3510 } 3511 } 3512 if (spptr->matTranspose) { 3513 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3514 if (matrix->values) { 3515 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3516 } 3517 } 3518 } 3519 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3520 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3521 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3522 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3523 else A->offloadmask = PETSC_OFFLOAD_CPU; 3524 PetscFunctionReturn(0); 3525 } 3526 3527 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3528 { 3529 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3530 PetscErrorCode ierr; 3531 3532 PetscFunctionBegin; 3533 if (A->factortype != MAT_FACTOR_NONE) { 3534 A->boundtocpu = flg; 3535 PetscFunctionReturn(0); 3536 } 3537 if (flg) { 3538 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3539 3540 A->ops->scale = MatScale_SeqAIJ; 3541 A->ops->axpy = MatAXPY_SeqAIJ; 3542 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3543 A->ops->mult = MatMult_SeqAIJ; 3544 A->ops->multadd = MatMultAdd_SeqAIJ; 3545 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3546 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3547 A->ops->multhermitiantranspose = NULL; 3548 A->ops->multhermitiantransposeadd = NULL; 3549 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3550 ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3552 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3553 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3554 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3555 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3556 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3557 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3558 } else { 3559 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3560 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3561 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3562 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3563 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3564 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3565 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3566 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3567 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3568 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3569 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3570 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3571 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3572 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3573 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3574 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3575 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3576 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3577 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3578 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3579 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3580 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3581 } 3582 A->boundtocpu = flg; 3583 if (flg && a->inode.size) { 3584 a->inode.use = PETSC_TRUE; 3585 } else { 3586 a->inode.use = PETSC_FALSE; 3587 } 3588 PetscFunctionReturn(0); 3589 } 3590 3591 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3592 { 3593 PetscErrorCode ierr; 3594 cusparseStatus_t stat; 3595 Mat B; 3596 3597 PetscFunctionBegin; 3598 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3599 if (reuse == MAT_INITIAL_MATRIX) { 3600 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3601 } else if (reuse == MAT_REUSE_MATRIX) { 3602 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3603 } 3604 B = *newmat; 3605 3606 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3607 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3608 3609 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3610 if (B->factortype == MAT_FACTOR_NONE) { 3611 Mat_SeqAIJCUSPARSE *spptr; 3612 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3613 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3614 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3615 spptr->format = MAT_CUSPARSE_CSR; 3616 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3617 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3618 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3619 #else 3620 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3621 #endif 3622 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3623 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3624 #endif 3625 B->spptr = spptr; 3626 } else { 3627 Mat_SeqAIJCUSPARSETriFactors *spptr; 3628 3629 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3630 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3631 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3632 B->spptr = spptr; 3633 } 3634 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3635 } 3636 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3637 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3638 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3639 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3640 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3641 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3642 3643 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3644 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3645 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3646 #if defined(PETSC_HAVE_HYPRE) 3647 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3648 #endif 3649 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr); 3650 PetscFunctionReturn(0); 3651 } 3652 3653 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3654 { 3655 PetscErrorCode ierr; 3656 3657 PetscFunctionBegin; 3658 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3659 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3660 PetscFunctionReturn(0); 3661 } 3662 3663 /*MC 3664 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3665 3666 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3667 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3668 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3669 3670 Options Database Keys: 3671 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3672 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3673 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3674 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3675 3676 Level: beginner 3677 3678 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3679 M*/ 3680 3681 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3682 3683 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3684 { 3685 PetscErrorCode ierr; 3686 3687 PetscFunctionBegin; 3688 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3689 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3690 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3691 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3692 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3693 3694 PetscFunctionReturn(0); 3695 } 3696 3697 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3698 { 3699 PetscErrorCode ierr; 3700 cusparseStatus_t stat; 3701 cudaError_t cerr; 3702 3703 PetscFunctionBegin; 3704 if (*cusparsestruct) { 3705 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3706 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3707 delete (*cusparsestruct)->workVector; 3708 delete (*cusparsestruct)->rowoffsets_gpu; 3709 delete (*cusparsestruct)->cooPerm; 3710 delete (*cusparsestruct)->cooPerm_a; 3711 delete (*cusparsestruct)->csr2csc_i; 3712 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3713 if ((*cusparsestruct)->use_extended_coo) { 3714 cerr = cudaFree((*cusparsestruct)->jmap_d);CHKERRCUDA(cerr); 3715 cerr = cudaFree((*cusparsestruct)->perm_d);CHKERRCUDA(cerr); 3716 } 3717 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3718 } 3719 PetscFunctionReturn(0); 3720 } 3721 3722 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3723 { 3724 PetscFunctionBegin; 3725 if (*mat) { 3726 delete (*mat)->values; 3727 delete (*mat)->column_indices; 3728 delete (*mat)->row_offsets; 3729 delete *mat; 3730 *mat = 0; 3731 } 3732 PetscFunctionReturn(0); 3733 } 3734 3735 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3736 { 3737 cusparseStatus_t stat; 3738 PetscErrorCode ierr; 3739 3740 PetscFunctionBegin; 3741 if (*trifactor) { 3742 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3743 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3744 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3745 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3746 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3747 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3748 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3749 #endif 3750 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3751 } 3752 PetscFunctionReturn(0); 3753 } 3754 3755 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3756 { 3757 CsrMatrix *mat; 3758 cusparseStatus_t stat; 3759 cudaError_t err; 3760 3761 PetscFunctionBegin; 3762 if (*matstruct) { 3763 if ((*matstruct)->mat) { 3764 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3765 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3766 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3767 #else 3768 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3769 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3770 #endif 3771 } else { 3772 mat = (CsrMatrix*)(*matstruct)->mat; 3773 CsrMatrix_Destroy(&mat); 3774 } 3775 } 3776 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3777 delete (*matstruct)->cprowIndices; 3778 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3779 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3780 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3781 3782 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3783 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3784 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3785 for (int i=0; i<3; i++) { 3786 if (mdata->cuSpMV[i].initialized) { 3787 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3788 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3789 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3790 } 3791 } 3792 #endif 3793 delete *matstruct; 3794 *matstruct = NULL; 3795 } 3796 PetscFunctionReturn(0); 3797 } 3798 3799 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3800 { 3801 PetscErrorCode ierr; 3802 3803 PetscFunctionBegin; 3804 if (*trifactors) { 3805 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3806 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3807 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3808 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3809 delete (*trifactors)->rpermIndices; 3810 delete (*trifactors)->cpermIndices; 3811 delete (*trifactors)->workVector; 3812 (*trifactors)->rpermIndices = NULL; 3813 (*trifactors)->cpermIndices = NULL; 3814 (*trifactors)->workVector = NULL; 3815 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3816 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3817 (*trifactors)->init_dev_prop = PETSC_FALSE; 3818 } 3819 PetscFunctionReturn(0); 3820 } 3821 3822 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3823 { 3824 PetscErrorCode ierr; 3825 cusparseHandle_t handle; 3826 cusparseStatus_t stat; 3827 3828 PetscFunctionBegin; 3829 if (*trifactors) { 3830 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3831 if (handle = (*trifactors)->handle) { 3832 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3833 } 3834 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3835 } 3836 PetscFunctionReturn(0); 3837 } 3838 3839 struct IJCompare 3840 { 3841 __host__ __device__ 3842 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3843 { 3844 if (t1.get<0>() < t2.get<0>()) return true; 3845 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3846 return false; 3847 } 3848 }; 3849 3850 struct IJEqual 3851 { 3852 __host__ __device__ 3853 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3854 { 3855 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3856 return true; 3857 } 3858 }; 3859 3860 struct IJDiff 3861 { 3862 __host__ __device__ 3863 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3864 { 3865 return t1 == t2 ? 0 : 1; 3866 } 3867 }; 3868 3869 struct IJSum 3870 { 3871 __host__ __device__ 3872 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3873 { 3874 return t1||t2; 3875 } 3876 }; 3877 3878 #include <thrust/iterator/discard_iterator.h> 3879 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 3880 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 3881 { 3882 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3883 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3884 THRUSTARRAY *cooPerm_v = NULL; 3885 thrust::device_ptr<const PetscScalar> d_v; 3886 CsrMatrix *matrix; 3887 PetscErrorCode ierr; 3888 PetscInt n; 3889 3890 PetscFunctionBegin; 3891 PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3892 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3893 if (!cusp->cooPerm) { 3894 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3895 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3896 PetscFunctionReturn(0); 3897 } 3898 matrix = (CsrMatrix*)cusp->mat->mat; 3899 PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3900 if (!v) { 3901 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3902 goto finalize; 3903 } 3904 n = cusp->cooPerm->size(); 3905 if (isCudaMem(v)) { 3906 d_v = thrust::device_pointer_cast(v); 3907 } else { 3908 cooPerm_v = new THRUSTARRAY(n); 3909 cooPerm_v->assign(v,v+n); 3910 d_v = cooPerm_v->data(); 3911 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3912 } 3913 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3914 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3915 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3916 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3917 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3918 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3919 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3920 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3921 */ 3922 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3923 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3924 delete cooPerm_w; 3925 } else { 3926 /* all nonzeros in d_v[] are unique entries */ 3927 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3928 matrix->values->begin())); 3929 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3930 matrix->values->end())); 3931 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3932 } 3933 } else { 3934 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3935 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3936 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3937 } else { 3938 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3939 matrix->values->begin())); 3940 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3941 matrix->values->end())); 3942 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3943 } 3944 } 3945 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3946 finalize: 3947 delete cooPerm_v; 3948 A->offloadmask = PETSC_OFFLOAD_GPU; 3949 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3950 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3951 ierr = PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3952 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3953 ierr = PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr); 3954 a->reallocs = 0; 3955 A->info.mallocs += 0; 3956 A->info.nz_unneeded = 0; 3957 A->assembled = A->was_assembled = PETSC_TRUE; 3958 A->num_ass++; 3959 PetscFunctionReturn(0); 3960 } 3961 3962 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3963 { 3964 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3965 PetscErrorCode ierr; 3966 3967 PetscFunctionBegin; 3968 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3969 if (!cusp) PetscFunctionReturn(0); 3970 if (destroy) { 3971 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3972 delete cusp->csr2csc_i; 3973 cusp->csr2csc_i = NULL; 3974 } 3975 A->transupdated = PETSC_FALSE; 3976 PetscFunctionReturn(0); 3977 } 3978 3979 #include <thrust/binary_search.h> 3980 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 3981 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 3982 { 3983 PetscErrorCode ierr; 3984 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3985 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3986 PetscInt cooPerm_n, nzr = 0; 3987 cudaError_t cerr; 3988 3989 PetscFunctionBegin; 3990 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3991 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3992 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 3993 if (n != cooPerm_n) { 3994 delete cusp->cooPerm; 3995 delete cusp->cooPerm_a; 3996 cusp->cooPerm = NULL; 3997 cusp->cooPerm_a = NULL; 3998 } 3999 if (n) { 4000 THRUSTINTARRAY d_i(n); 4001 THRUSTINTARRAY d_j(n); 4002 THRUSTINTARRAY ii(A->rmap->n); 4003 4004 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 4005 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 4006 4007 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 4008 d_i.assign(coo_i,coo_i+n); 4009 d_j.assign(coo_j,coo_j+n); 4010 4011 /* Ex. 4012 n = 6 4013 coo_i = [3,3,1,4,1,4] 4014 coo_j = [3,2,2,5,2,6] 4015 */ 4016 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 4017 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 4018 4019 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4020 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4021 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4022 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 4023 THRUSTINTARRAY w = d_j; 4024 4025 /* 4026 d_i = [1,1,3,3,4,4] 4027 d_j = [2,2,2,3,5,6] 4028 cooPerm = [2,4,1,0,3,5] 4029 */ 4030 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4031 4032 /* 4033 d_i = [1,3,3,4,4,x] 4034 ^ekey 4035 d_j = [2,2,3,5,6,x] 4036 ^nekye 4037 */ 4038 if (nekey == ekey) { /* all entries are unique */ 4039 delete cusp->cooPerm_a; 4040 cusp->cooPerm_a = NULL; 4041 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4042 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4043 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4044 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4045 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4046 w[0] = 0; 4047 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4048 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4049 } 4050 thrust::counting_iterator<PetscInt> search_begin(0); 4051 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4052 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4053 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4054 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4055 4056 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 4057 a->singlemalloc = PETSC_FALSE; 4058 a->free_a = PETSC_TRUE; 4059 a->free_ij = PETSC_TRUE; 4060 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 4061 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4062 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4063 a->nz = a->maxnz = a->i[A->rmap->n]; 4064 a->rmax = 0; 4065 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 4066 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 4067 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4068 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 4069 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 4070 for (PetscInt i = 0; i < A->rmap->n; i++) { 4071 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4072 nzr += (PetscInt)!!(nnzr); 4073 a->ilen[i] = a->imax[i] = nnzr; 4074 a->rmax = PetscMax(a->rmax,nnzr); 4075 } 4076 a->nonzerorowcnt = nzr; 4077 A->preallocated = PETSC_TRUE; 4078 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4079 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 4080 } else { 4081 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 4082 } 4083 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 4084 4085 /* We want to allocate the CUSPARSE struct for matvec now. 4086 The code is so convoluted now that I prefer to copy zeros */ 4087 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 4088 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 4089 A->offloadmask = PETSC_OFFLOAD_CPU; 4090 A->nonzerostate++; 4091 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4092 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 4093 4094 A->assembled = PETSC_FALSE; 4095 A->was_assembled = PETSC_FALSE; 4096 PetscFunctionReturn(0); 4097 } 4098 4099 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 4100 { 4101 PetscErrorCode ierr; 4102 cudaError_t cerr; 4103 Mat_SeqAIJ *seq; 4104 Mat_SeqAIJCUSPARSE *dev; 4105 Mat newmat; 4106 PetscInt coo_basic = 1; 4107 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4108 4109 PetscFunctionBegin; 4110 if (coo_i) { 4111 ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr); 4112 if (PetscMemTypeHost(mtype)) { 4113 for (PetscCount k=0; k<coo_n; k++) { 4114 if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = 0; break;} 4115 } 4116 } 4117 } 4118 4119 if (coo_basic) { /* i,j are on device or do not contain negative indices */ 4120 ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 4121 } else { 4122 ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr); 4123 ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr); 4124 ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr); 4125 ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr); 4126 ierr = MatConvert(newmat,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 4127 ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 4128 ierr = MatSeqAIJCUSPARSECopyToGPU(mat);CHKERRQ(ierr); 4129 ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 4130 4131 seq = static_cast<Mat_SeqAIJ*>(mat->data); 4132 dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 4133 cerr = cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount));CHKERRCUDA(cerr); 4134 cerr = cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4135 cerr = cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount));CHKERRCUDA(cerr); 4136 cerr = cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4137 dev->use_extended_coo = PETSC_TRUE; 4138 } 4139 PetscFunctionReturn(0); 4140 } 4141 4142 __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4143 { 4144 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4145 const PetscCount grid_size = gridDim.x * blockDim.x; 4146 for (; i<nnz; i+= grid_size) { 4147 PetscScalar sum = 0.0; 4148 for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4149 a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum; 4150 } 4151 } 4152 4153 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4154 { 4155 PetscErrorCode ierr; 4156 cudaError_t cerr; 4157 Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4158 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4159 PetscCount Annz = seq->nz; 4160 PetscMemType memtype; 4161 const PetscScalar *v1 = v; 4162 PetscScalar *Aa; 4163 4164 PetscFunctionBegin; 4165 if (dev->use_extended_coo) { 4166 ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 4167 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4168 cerr = cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr); 4169 cerr = cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4170 } 4171 4172 if (imode == INSERT_VALUES) {ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr);} 4173 else {ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr);} 4174 4175 MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 4176 4177 if (imode == INSERT_VALUES) {ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);} 4178 else {ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);} 4179 4180 if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);} 4181 } else { 4182 ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode);CHKERRQ(ierr); 4183 } 4184 PetscFunctionReturn(0); 4185 } 4186 4187 /*@C 4188 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 4189 4190 Not collective 4191 4192 Input Parameters: 4193 + A - the matrix 4194 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4195 4196 Output Parameters: 4197 + ia - the CSR row pointers 4198 - ja - the CSR column indices 4199 4200 Level: developer 4201 4202 Notes: 4203 When compressed is true, the CSR structure does not contain empty rows 4204 4205 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 4206 @*/ 4207 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4208 { 4209 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4210 CsrMatrix *csr; 4211 PetscErrorCode ierr; 4212 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4213 4214 PetscFunctionBegin; 4215 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4216 if (!i || !j) PetscFunctionReturn(0); 4217 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4218 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4219 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4220 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4221 csr = (CsrMatrix*)cusp->mat->mat; 4222 if (i) { 4223 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4224 if (!cusp->rowoffsets_gpu) { 4225 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4226 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4227 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4228 } 4229 *i = cusp->rowoffsets_gpu->data().get(); 4230 } else *i = csr->row_offsets->data().get(); 4231 } 4232 if (j) *j = csr->column_indices->data().get(); 4233 PetscFunctionReturn(0); 4234 } 4235 4236 /*@C 4237 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4238 4239 Not collective 4240 4241 Input Parameters: 4242 + A - the matrix 4243 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4244 4245 Output Parameters: 4246 + ia - the CSR row pointers 4247 - ja - the CSR column indices 4248 4249 Level: developer 4250 4251 .seealso: MatSeqAIJCUSPARSEGetIJ() 4252 @*/ 4253 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4254 { 4255 PetscFunctionBegin; 4256 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4257 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4258 if (i) *i = NULL; 4259 if (j) *j = NULL; 4260 PetscFunctionReturn(0); 4261 } 4262 4263 /*@C 4264 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4265 4266 Not Collective 4267 4268 Input Parameter: 4269 . A - a MATSEQAIJCUSPARSE matrix 4270 4271 Output Parameter: 4272 . a - pointer to the device data 4273 4274 Level: developer 4275 4276 Notes: may trigger host-device copies if up-to-date matrix data is on host 4277 4278 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 4279 @*/ 4280 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4281 { 4282 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4283 CsrMatrix *csr; 4284 PetscErrorCode ierr; 4285 4286 PetscFunctionBegin; 4287 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4288 PetscValidPointer(a,2); 4289 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4290 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4291 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4292 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4293 csr = (CsrMatrix*)cusp->mat->mat; 4294 PetscCheckFalse(!csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4295 *a = csr->values->data().get(); 4296 PetscFunctionReturn(0); 4297 } 4298 4299 /*@C 4300 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4301 4302 Not Collective 4303 4304 Input Parameter: 4305 . A - a MATSEQAIJCUSPARSE matrix 4306 4307 Output Parameter: 4308 . a - pointer to the device data 4309 4310 Level: developer 4311 4312 .seealso: MatSeqAIJCUSPARSEGetArrayRead() 4313 @*/ 4314 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4315 { 4316 PetscFunctionBegin; 4317 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4318 PetscValidPointer(a,2); 4319 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4320 *a = NULL; 4321 PetscFunctionReturn(0); 4322 } 4323 4324 /*@C 4325 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4326 4327 Not Collective 4328 4329 Input Parameter: 4330 . A - a MATSEQAIJCUSPARSE matrix 4331 4332 Output Parameter: 4333 . a - pointer to the device data 4334 4335 Level: developer 4336 4337 Notes: may trigger host-device copies if up-to-date matrix data is on host 4338 4339 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 4340 @*/ 4341 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4342 { 4343 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4344 CsrMatrix *csr; 4345 PetscErrorCode ierr; 4346 4347 PetscFunctionBegin; 4348 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4349 PetscValidPointer(a,2); 4350 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4351 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4352 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4353 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4354 csr = (CsrMatrix*)cusp->mat->mat; 4355 PetscCheckFalse(!csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4356 *a = csr->values->data().get(); 4357 A->offloadmask = PETSC_OFFLOAD_GPU; 4358 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4359 PetscFunctionReturn(0); 4360 } 4361 /*@C 4362 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4363 4364 Not Collective 4365 4366 Input Parameter: 4367 . A - a MATSEQAIJCUSPARSE matrix 4368 4369 Output Parameter: 4370 . a - pointer to the device data 4371 4372 Level: developer 4373 4374 .seealso: MatSeqAIJCUSPARSEGetArray() 4375 @*/ 4376 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4377 { 4378 PetscErrorCode ierr; 4379 4380 PetscFunctionBegin; 4381 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4382 PetscValidPointer(a,2); 4383 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4384 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 4385 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4386 *a = NULL; 4387 PetscFunctionReturn(0); 4388 } 4389 4390 /*@C 4391 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4392 4393 Not Collective 4394 4395 Input Parameter: 4396 . A - a MATSEQAIJCUSPARSE matrix 4397 4398 Output Parameter: 4399 . a - pointer to the device data 4400 4401 Level: developer 4402 4403 Notes: does not trigger host-device copies and flags data validity on the GPU 4404 4405 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 4406 @*/ 4407 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4408 { 4409 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4410 CsrMatrix *csr; 4411 PetscErrorCode ierr; 4412 4413 PetscFunctionBegin; 4414 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4415 PetscValidPointer(a,2); 4416 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4417 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4418 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4419 csr = (CsrMatrix*)cusp->mat->mat; 4420 PetscCheckFalse(!csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4421 *a = csr->values->data().get(); 4422 A->offloadmask = PETSC_OFFLOAD_GPU; 4423 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4424 PetscFunctionReturn(0); 4425 } 4426 4427 /*@C 4428 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 4429 4430 Not Collective 4431 4432 Input Parameter: 4433 . A - a MATSEQAIJCUSPARSE matrix 4434 4435 Output Parameter: 4436 . a - pointer to the device data 4437 4438 Level: developer 4439 4440 .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 4441 @*/ 4442 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4443 { 4444 PetscErrorCode ierr; 4445 4446 PetscFunctionBegin; 4447 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4448 PetscValidPointer(a,2); 4449 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4450 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 4451 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4452 *a = NULL; 4453 PetscFunctionReturn(0); 4454 } 4455 4456 struct IJCompare4 4457 { 4458 __host__ __device__ 4459 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4460 { 4461 if (t1.get<0>() < t2.get<0>()) return true; 4462 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4463 return false; 4464 } 4465 }; 4466 4467 struct Shift 4468 { 4469 int _shift; 4470 4471 Shift(int shift) : _shift(shift) {} 4472 __host__ __device__ 4473 inline int operator() (const int &c) 4474 { 4475 return c + _shift; 4476 } 4477 }; 4478 4479 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4480 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4481 { 4482 PetscErrorCode ierr; 4483 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4484 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4485 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4486 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4487 PetscInt Annz,Bnnz; 4488 cusparseStatus_t stat; 4489 PetscInt i,m,n,zero = 0; 4490 cudaError_t cerr; 4491 4492 PetscFunctionBegin; 4493 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4494 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4495 PetscValidPointer(C,4); 4496 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4497 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4498 PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 4499 PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4500 PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4501 PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4502 if (reuse == MAT_INITIAL_MATRIX) { 4503 m = A->rmap->n; 4504 n = A->cmap->n + B->cmap->n; 4505 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4506 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4507 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4508 c = (Mat_SeqAIJ*)(*C)->data; 4509 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4510 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4511 Ccsr = new CsrMatrix; 4512 Cmat->cprowIndices = NULL; 4513 c->compressedrow.use = PETSC_FALSE; 4514 c->compressedrow.nrows = 0; 4515 c->compressedrow.i = NULL; 4516 c->compressedrow.rindex = NULL; 4517 Ccusp->workVector = NULL; 4518 Ccusp->nrows = m; 4519 Ccusp->mat = Cmat; 4520 Ccusp->mat->mat = Ccsr; 4521 Ccsr->num_rows = m; 4522 Ccsr->num_cols = n; 4523 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4524 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4525 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4526 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4527 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4528 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4529 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4530 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4531 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4532 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4533 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4534 PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4535 PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4536 4537 Acsr = (CsrMatrix*)Acusp->mat->mat; 4538 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4539 Annz = (PetscInt)Acsr->column_indices->size(); 4540 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4541 c->nz = Annz + Bnnz; 4542 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4543 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4544 Ccsr->values = new THRUSTARRAY(c->nz); 4545 Ccsr->num_entries = c->nz; 4546 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4547 if (c->nz) { 4548 auto Acoo = new THRUSTINTARRAY32(Annz); 4549 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4550 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4551 THRUSTINTARRAY32 *Aroff,*Broff; 4552 4553 if (a->compressedrow.use) { /* need full row offset */ 4554 if (!Acusp->rowoffsets_gpu) { 4555 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4556 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4557 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4558 } 4559 Aroff = Acusp->rowoffsets_gpu; 4560 } else Aroff = Acsr->row_offsets; 4561 if (b->compressedrow.use) { /* need full row offset */ 4562 if (!Bcusp->rowoffsets_gpu) { 4563 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4564 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4565 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4566 } 4567 Broff = Bcusp->rowoffsets_gpu; 4568 } else Broff = Bcsr->row_offsets; 4569 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4570 stat = cusparseXcsr2coo(Acusp->handle, 4571 Aroff->data().get(), 4572 Annz, 4573 m, 4574 Acoo->data().get(), 4575 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4576 stat = cusparseXcsr2coo(Bcusp->handle, 4577 Broff->data().get(), 4578 Bnnz, 4579 m, 4580 Bcoo->data().get(), 4581 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4582 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4583 auto Aperm = thrust::make_constant_iterator(1); 4584 auto Bperm = thrust::make_constant_iterator(0); 4585 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4586 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4587 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4588 #else 4589 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4590 auto Bcib = Bcsr->column_indices->begin(); 4591 auto Bcie = Bcsr->column_indices->end(); 4592 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4593 #endif 4594 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4595 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4596 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4597 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4598 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4599 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4600 auto p1 = Ccusp->cooPerm->begin(); 4601 auto p2 = Ccusp->cooPerm->begin(); 4602 thrust::advance(p2,Annz); 4603 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4604 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4605 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4606 #endif 4607 auto cci = thrust::make_counting_iterator(zero); 4608 auto cce = thrust::make_counting_iterator(c->nz); 4609 #if 0 //Errors on SUMMIT cuda 11.1.0 4610 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4611 #else 4612 auto pred = thrust::identity<int>(); 4613 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4614 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4615 #endif 4616 stat = cusparseXcoo2csr(Ccusp->handle, 4617 Ccoo->data().get(), 4618 c->nz, 4619 m, 4620 Ccsr->row_offsets->data().get(), 4621 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4622 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4623 delete wPerm; 4624 delete Acoo; 4625 delete Bcoo; 4626 delete Ccoo; 4627 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4628 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4629 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4630 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4631 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4632 #endif 4633 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4634 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 4635 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4636 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4637 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4638 CsrMatrix *CcsrT = new CsrMatrix; 4639 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4640 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4641 4642 (*C)->form_explicit_transpose = PETSC_TRUE; 4643 (*C)->transupdated = PETSC_TRUE; 4644 Ccusp->rowoffsets_gpu = NULL; 4645 CmatT->cprowIndices = NULL; 4646 CmatT->mat = CcsrT; 4647 CcsrT->num_rows = n; 4648 CcsrT->num_cols = m; 4649 CcsrT->num_entries = c->nz; 4650 4651 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4652 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4653 CcsrT->values = new THRUSTARRAY(c->nz); 4654 4655 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4656 auto rT = CcsrT->row_offsets->begin(); 4657 if (AT) { 4658 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4659 thrust::advance(rT,-1); 4660 } 4661 if (BT) { 4662 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4663 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4664 thrust::copy(titb,tite,rT); 4665 } 4666 auto cT = CcsrT->column_indices->begin(); 4667 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4668 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4669 auto vT = CcsrT->values->begin(); 4670 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4671 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4672 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4673 4674 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4675 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4676 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4677 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4678 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4679 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4680 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4681 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4682 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4683 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4684 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4685 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4686 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4687 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4688 #endif 4689 Ccusp->matTranspose = CmatT; 4690 } 4691 } 4692 4693 c->singlemalloc = PETSC_FALSE; 4694 c->free_a = PETSC_TRUE; 4695 c->free_ij = PETSC_TRUE; 4696 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4697 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4698 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4699 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4700 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4701 ii = *Ccsr->row_offsets; 4702 jj = *Ccsr->column_indices; 4703 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4704 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4705 } else { 4706 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4707 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4708 } 4709 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4710 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4711 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4712 c->maxnz = c->nz; 4713 c->nonzerorowcnt = 0; 4714 c->rmax = 0; 4715 for (i = 0; i < m; i++) { 4716 const PetscInt nn = c->i[i+1] - c->i[i]; 4717 c->ilen[i] = c->imax[i] = nn; 4718 c->nonzerorowcnt += (PetscInt)!!nn; 4719 c->rmax = PetscMax(c->rmax,nn); 4720 } 4721 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4722 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4723 (*C)->nonzerostate++; 4724 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4725 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4726 Ccusp->nonzerostate = (*C)->nonzerostate; 4727 (*C)->preallocated = PETSC_TRUE; 4728 } else { 4729 PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4730 c = (Mat_SeqAIJ*)(*C)->data; 4731 if (c->nz) { 4732 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4733 PetscCheckFalse(!Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4734 PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4735 PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4736 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4737 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4738 PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4739 PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4740 Acsr = (CsrMatrix*)Acusp->mat->mat; 4741 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4742 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4743 PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 4744 PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4745 PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4746 PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4747 PetscCheckFalse(Ccusp->cooPerm->size() != Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4748 auto pmid = Ccusp->cooPerm->begin(); 4749 thrust::advance(pmid,Acsr->num_entries); 4750 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4751 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4752 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4753 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4754 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4755 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4756 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4757 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4758 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4759 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4760 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4761 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4762 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4763 PetscCheckFalse(!Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4764 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4765 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4766 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4767 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4768 auto vT = CcsrT->values->begin(); 4769 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4770 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4771 (*C)->transupdated = PETSC_TRUE; 4772 } 4773 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4774 } 4775 } 4776 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4777 (*C)->assembled = PETSC_TRUE; 4778 (*C)->was_assembled = PETSC_FALSE; 4779 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4780 PetscFunctionReturn(0); 4781 } 4782 4783 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4784 { 4785 PetscErrorCode ierr; 4786 bool dmem; 4787 const PetscScalar *av; 4788 cudaError_t cerr; 4789 4790 PetscFunctionBegin; 4791 dmem = isCudaMem(v); 4792 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4793 if (n && idx) { 4794 THRUSTINTARRAY widx(n); 4795 widx.assign(idx,idx+n); 4796 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4797 4798 THRUSTARRAY *w = NULL; 4799 thrust::device_ptr<PetscScalar> dv; 4800 if (dmem) { 4801 dv = thrust::device_pointer_cast(v); 4802 } else { 4803 w = new THRUSTARRAY(n); 4804 dv = w->data(); 4805 } 4806 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4807 4808 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4809 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4810 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4811 if (w) { 4812 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4813 } 4814 delete w; 4815 } else { 4816 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4817 } 4818 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4819 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4820 PetscFunctionReturn(0); 4821 } 4822