/*
 Defines the basic matrix operations for the AIJ (compressed row)
 matrix storage format using the CUSPARSE library,
*/
/* NOTE(review): these macros presumably alter what the PETSc headers expose when compiled by nvcc — confirm against petscconf/petscsys headers */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* Option-name table for PetscOptionsEnum(): value names, then the enum type name, the option prefix, and a terminating 0 */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the SeqAIJCUSPARSE implementations installed on Mat function tables below */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side data structures (C++ overloads distinguished by argument type) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Record a user-supplied CUDA stream in the matrix's GPU data and make the cusparse handle use it.
   Errors with PETSC_ERR_COR if the matrix has no GPU data (spptr) yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the matrix's cusparse handle with a user-supplied one.
   Any previously owned handle is destroyed first; the new handle is switched to
   device pointer mode (scalars such as alpha/beta are read from device memory). */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the old handle so we do not leak it; we now borrow the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the cusparse handle stored on a MATSEQAIJCUSPARSE matrix.
   No-op for other matrix types or when no GPU data exists — the handle is assumed
   to be owned elsewhere (set via MatCUSPARSESetHandle()). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory for MATSOLVERCUSPARSE factor matrices: creates an n x n MATSEQAIJCUSPARSE matrix B
   and installs the symbolic-factorization routines appropriate for ftype.  GPU variants are
   installed unless A is bound to the CPU, in which case the plain SeqAIJ routines are used. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* propagate the CPU binding from A when requested so the factor also stays on the CPU */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Implementation behind MatCUSPARSESetFormat() for SEQAIJCUSPARSE: stores the requested
   storage format.  MULT and ALL currently set the same (single) format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for types that do not compose MatCUSPARSESetFormat_C */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Implementation behind MatCUSPARSESetUseCPUSolve() for SEQAIJCUSPARSE: records the flag read by
   MatLUFactorNumeric_SeqAIJCUSPARSE when choosing CPU vs GPU MatSolve */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Output Parameter:

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for types that do not compose MatCUSPARSESetUseCPUSolve_C */
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSetOption() override: intercepts MAT_FORM_EXPLICIT_TRANSPOSE (to drop a stale GPU transpose
   when the option is turned off) and forwards every other option to the SeqAIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU: factors on the CPU via MatLUFactorNumeric_SeqAIJ, then (unless CPU solve was
   requested) installs the GPU MatSolve variants and uploads the triangular factors.
   Natural-ordering solve variants are used when both row and column permutations are identity. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* make sure the CPU copy of A is current before the CPU factorization */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Options processing: storage format, CPU-solve flag, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/
   csr2csc algorithm choices.  The sanity checks guard against cuSPARSE renumbering its enums,
   since PetscOptionsEnum() maps options by position in the Mat*Algorithms[] arrays. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any previously uploaded GPU factors, delegates to the SeqAIJ symbolic
   routine, and installs the CUSPARSE numeric-factorization hook */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: resets GPU factors, delegates to SeqAIJ, installs the CUSPARSE Cholesky numeric hook */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as ICC above */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors
*cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Upload the unit-lower-triangular factor L of the CPU (I)LU factorization to the GPU.
   First call: builds the CSR arrays on the host (adding the implicit unit diagonal entries),
   creates the cusparse descriptor/analysis data, and runs the triangular-solve analysis.
   Later calls (structure unchanged): only refresh the numerical values and re-upload them.
   Assumes a->i/a->j/a->a hold the combined L/U factor produced by MatLUFactorNumeric_SeqAIJ
   (lower part stored without its unit diagonal) — layout per the SeqAIJ factor convention. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers for fast transfer to the device */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API needs an explicit work buffer sized by csrsv2_bufferSize */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;  /* keep the pinned value buffer for later value-only updates */
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Upload the upper-triangular factor U (non-unit diagonal, stored inverted for the solve) of the
   CPU (I)LU factorization to the GPU; same first-call/update-only structure as the lower-tri build.
   Uses a->diag to walk the U part of the combined SeqAIJ factor row by row, backwards. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask ==
PETSC_OFFLOAD_CPU) { 594 try { 595 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 596 nzUpper = adiag[0]-adiag[n]; 597 if (!upTriFactor) { 598 PetscScalar *AAUp; 599 600 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 601 602 /* Allocate Space for the upper triangular matrix */ 603 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 604 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 605 606 /* Fill the upper triangular matrix */ 607 AiUp[0]=(PetscInt) 0; 608 AiUp[n]=nzUpper; 609 offset = nzUpper; 610 for (i=n-1; i>=0; i--) { 611 v = aa + adiag[i+1] + 1; 612 vi = aj + adiag[i+1] + 1; 613 614 /* number of elements NOT on the diagonal */ 615 nz = adiag[i] - adiag[i+1]-1; 616 617 /* decrement the offset */ 618 offset -= (nz+1); 619 620 /* first, set the diagonal elements */ 621 AjUp[offset] = (PetscInt) i; 622 AAUp[offset] = (MatScalar)1./v[nz]; 623 AiUp[i] = AiUp[i+1] - (nz+1); 624 625 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 626 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 627 } 628 629 /* allocate space for the triangular factor information */ 630 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 631 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 632 633 /* Create the matrix description */ 634 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 635 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 636 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 637 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 638 #else 639 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 640 #endif 641 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 642 stat = cusparseSetMatDiagType(upTriFactor->descr, 
CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 643 644 /* set the operation */ 645 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 646 647 /* set the matrix */ 648 upTriFactor->csrMat = new CsrMatrix; 649 upTriFactor->csrMat->num_rows = n; 650 upTriFactor->csrMat->num_cols = n; 651 upTriFactor->csrMat->num_entries = nzUpper; 652 653 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 654 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 655 656 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 657 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 658 659 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 660 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 661 662 /* Create the solve analysis information */ 663 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 664 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 665 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 666 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 667 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 668 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 669 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 670 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 671 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 672 #endif 673 674 /* perform the solve analysis */ 675 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 676 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 677 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 678 upTriFactor->csrMat->column_indices->data().get(), 679 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 680 upTriFactor->solveInfo, 681 upTriFactor->solvePolicy, 
upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 682 #else 683 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 684 #endif 685 cerr = WaitForCUDA();CHKERRCUDA(cerr); 686 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 687 688 /* assign the pointer */ 689 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 690 upTriFactor->AA_h = AAUp; 691 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 692 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 693 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 694 } else { 695 if (!upTriFactor->AA_h) { 696 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 697 } 698 /* Fill the upper triangular matrix */ 699 offset = nzUpper; 700 for (i=n-1; i>=0; i--) { 701 v = aa + adiag[i+1] + 1; 702 703 /* number of elements NOT on the diagonal */ 704 nz = adiag[i] - adiag[i+1]-1; 705 706 /* decrement the offset */ 707 offset -= (nz+1); 708 709 /* first, set the diagonal elements */ 710 upTriFactor->AA_h[offset] = 1./v[nz]; 711 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 712 } 713 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 714 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 715 } 716 } catch(char *ex) { 717 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 718 } 719 } 720 PetscFunctionReturn(0); 721 } 722 723 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 724 { 725 PetscErrorCode ierr; 726 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 727 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 728 IS isrow = a->row,iscol = a->icol; 729 PetscBool row_identity,col_identity; 730 PetscInt n = A->rmap->n; 731 732 PetscFunctionBegin; 733 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 734 ierr = 
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used by the triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only when it is not the identity
     (solves then gather/scatter through rpermIndices on the device) */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: likewise for the column permutation */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Build the device-side upper/lower triangular factors for an ICC factorization.
   On first call (factors not yet allocated) the full CSR structure and values are
   staged in pinned host memory and uploaded, and the cusparse solve analysis is run;
   on subsequent calls only the numerical values are recomputed and re-uploaded.
   Both factors share the upper-triangular sparsity pattern: the "lower" solve is
   performed with CUSPARSE_OPERATION_TRANSPOSE on the upper-pattern data. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ (for i/j/a);
     presumably the ICC factor is stored in SBAIJ (upper-triangle) layout -- confirm against the factor type */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the factor values (upper and lower share nzUpper entries) */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;  /* overwritten below for i=0; kept for safety */
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: factor stores 1/diag */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for the upper factor, additionally scaled by 1/diag for the lower */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 path needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the lower factor reuses the UPPER pattern and solves with OPERATION_TRANSPOSE below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Factors already exist on the device: recompute only the numerical values
           (same loop as above, pattern untouched) and re-upload them. */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Upload the ICC factors of A to the GPU (via MatSeqAIJCUSPARSEBuildICCTriMatrices)
   and cache the row permutation and its inverse on the device when it is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* ICC stores only the upper triangle; count both triangles plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    /* rperm = ip, cperm = inverse(ip): the solve applies ip on input and its inverse on output */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Numeric Cholesky factorization: factor on the CPU, then move the factors to the GPU
   and select the MatSolve implementations appropriate for the ordering. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. 
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build the transposes of both triangular factors on the GPU (as CSC via csr2csc)
   and run the cusparse solve analysis on them, so that MatSolveTranspose can be
   performed without transposing on the fly. The transposed factors are stored in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor: transposing flips the fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* csr2cscEx2 needs an explicit work buffer sized by the library */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was a second PetscLogEventBegin, leaving the GenerateTranspose event
     unbalanced (never ended) and corrupting the nested event-log stack */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was a second PetscLogEventBegin (see the lower-factor block above) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* functor: truncate the real part of a PetscScalar to a PetscInt (used to recover
   the csr2csc permutation computed by transposing a sequence of indices) */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/* Form and cache the explicit transpose of the AIJ matrix on the GPU (for MatMultTranspose etc.) */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
cusparseStatus_t stat; 1276 cusparseIndexBase_t indexBase; 1277 cudaError_t err; 1278 PetscErrorCode ierr; 1279 1280 PetscFunctionBegin; 1281 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1282 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1283 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1284 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1285 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1286 if (A->transupdated) PetscFunctionReturn(0); 1287 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1288 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1289 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1290 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1291 } 1292 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1293 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1294 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1295 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1296 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1297 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1298 1299 /* set alpha and beta */ 1300 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1301 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1302 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1303 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1304 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1305 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1306 1307 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1308 CsrMatrix *matrixT = new CsrMatrix; 1309 matstructT->mat = matrixT; 1310 matrixT->num_rows = A->cmap->n; 1311 matrixT->num_cols = A->rmap->n; 1312 matrixT->num_entries = a->nz; 1313 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1314 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1315 matrixT->values = new THRUSTARRAY(a->nz); 1316 1317 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1318 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1319 1320 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1321 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1322 stat = cusparseCreateCsr(&matstructT->matDescr, 1323 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1324 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1325 matrixT->values->data().get(), 1326 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1327 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1328 #else 1329 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1330 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1331 1332 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1333 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1334 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1335 */ 1336 if (matrixT->num_entries) { 1337 stat = cusparseCreateCsr(&matstructT->matDescr, 1338 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1339 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1340 matrixT->values->data().get(), 1341 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1342 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1343 1344 } else { 1345 matstructT->matDescr = NULL; 1346 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1347 } 1348 #endif 1349 #endif 1350 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1351 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1352 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1353 #else 1354 CsrMatrix *temp = new CsrMatrix; 1355 CsrMatrix *tempT = new CsrMatrix; 1356 /* First convert HYB to CSR */ 1357 temp->num_rows = A->rmap->n; 1358 temp->num_cols = A->cmap->n; 1359 temp->num_entries = a->nz; 1360 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1361 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1362 temp->values = new THRUSTARRAY(a->nz); 1363 1364 stat = cusparse_hyb2csr(cusparsestruct->handle, 1365 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1366 temp->values->data().get(), 1367 temp->row_offsets->data().get(), 1368 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1369 1370 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1371 tempT->num_rows = A->rmap->n; 1372 tempT->num_cols = A->cmap->n; 1373 tempT->num_entries = a->nz; 1374 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1375 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1376 tempT->values = new THRUSTARRAY(a->nz); 1377 1378 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1379 temp->num_cols, temp->num_entries, 1380 temp->values->data().get(), 1381 temp->row_offsets->data().get(), 1382 temp->column_indices->data().get(), 1383 tempT->values->data().get(), 1384 tempT->column_indices->data().get(), 1385 tempT->row_offsets->data().get(), 1386 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1387 1388 /* Last, convert CSC to HYB */ 1389 cusparseHybMat_t hybMat; 1390 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1391 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1392 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1393 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1394 matstructT->descr, tempT->values->data().get(), 1395 tempT->row_offsets->data().get(), 1396 tempT->column_indices->data().get(), 1397 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1398 1399 /* assign the pointer */ 1400 matstructT->mat = hybMat; 1401 A->transupdated = PETSC_TRUE; 1402 /* delete temporaries */ 1403 if (tempT) { 1404 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1405 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1406 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1407 delete (CsrMatrix*) tempT; 1408 } 1409 if (temp) { 1410 if (temp->values) delete (THRUSTARRAY*) temp->values; 1411 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1412 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1413 delete (CsrMatrix*) temp; 1414 } 1415 #endif 1416 } 1417 } 1418 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */ 1419 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1420 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1421 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1422 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1423 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1424 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1425 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1426 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1427 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1428 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1429 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1430 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1431 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1432 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1433 } 1434 if (!cusparsestruct->csr2csc_i) { 1435 THRUSTARRAY csr2csc_a(matrix->num_entries); 1436 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1437 1438 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1439 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1440 void *csr2cscBuffer; 1441 size_t csr2cscBufferSize; 1442 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1443 A->cmap->n, matrix->num_entries, 1444 matrix->values->data().get(), 1445 cusparsestruct->rowoffsets_gpu->data().get(), 1446 matrix->column_indices->data().get(), 1447 matrixT->values->data().get(), 1448 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 
cusparse_scalartype, 1449 CUSPARSE_ACTION_NUMERIC,indexBase, 1450 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1451 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1452 #endif 1453 1454 if (matrix->num_entries) { 1455 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1456 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1457 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1458 1459 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1460 should be filled with indexBase. So I just take a shortcut here. 1461 */ 1462 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1463 A->cmap->n,matrix->num_entries, 1464 csr2csc_a.data().get(), 1465 cusparsestruct->rowoffsets_gpu->data().get(), 1466 matrix->column_indices->data().get(), 1467 matrixT->values->data().get(), 1468 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1469 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1470 CUSPARSE_ACTION_NUMERIC,indexBase, 1471 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1472 #else 1473 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1474 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1475 #endif 1476 } else { 1477 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1478 } 1479 1480 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1481 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1482 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1483 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1484 #endif 1485 } 1486 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->begin()),
                                    thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                    matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Triangular solve with the transposed factors: x = A^{-T} b.
   The transposed lower/upper factors are built lazily on the first call via
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().  The right-hand side is first
   permuted with the row permutation into x, then U^T and L^T solves are applied
   (with tempGPU holding the intermediate between the two), and finally the
   column permutation is applied through a temporary (cannot be done in place). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  /* NOTE: the CUDA >= 9 variant of cusparse_solve takes nnz, a policy and a work buffer */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but the factorization was done in the
   natural (identity) ordering, so no row/column permutation steps are needed. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Triangular solve x = A^{-1} b with the factored matrix: permute b with the
   row permutation into the work vector, solve L then U, and apply the column
   permutation into x.  tempGPU serves both as permuted right-hand side and as
   the intermediate between the two triangular solves. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  /* (the permutation-iterator range length is defined by rpermIndices, so the
     same base bGPU can be used for both ends) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* As MatSolve_SeqAIJCUSPARSE, but the factorization used the natural ordering,
   so the permutation steps are skipped: solve L then U directly from b into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copy the matrix values from the GPU CSR copy back to the host array when the
   GPU copy is ahead of the host (offloadmask == PETSC_OFFLOAD_GPU); afterwards
   both copies are marked valid (PETSC_OFFLOAD_BOTH). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Read/write access to the host values array; sync from the GPU first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* Host array may have been modified: mark the CPU copy as the valid one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access to the host values array; sync from the GPU first. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* Read-only restore: nothing was modified, so the offload mask is left alone. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access: no sync from the GPU, existing values will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* Host array was written: the CPU copy is now the valid one. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Copy the host AIJ matrix to the GPU.  If the nonzero pattern is unchanged and
   the storage format is CSR, only the numerical values are refreshed; otherwise
   the whole cusparse structure is (re)built in the requested storage format. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* the cached transpose (if any) now holds stale values */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr =
PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern changed (or non-CSR format): discard and rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* use the compressed-row layout (only rows with nonzeros) when available */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* a->a may be absent (structure-only matrix); then only the pattern goes up
           and the offload mask is not set to BOTH at the end */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, required by CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* first stage to host-side CSR arrays on the device, then convert to HYB */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once the HYB matrix exists */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor over a zipped (x,y) tuple: y = y + x */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor over a zipped (x,y) tuple: y = x */
struct VecCUDAEquals
{
template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* thrust functor: in a (x,y) tuple, copy in the reverse direction, x = y */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Product data attached to C->product for AIJCUSPARSE mat-mat products; released by MatDestroy_MatMatCusparse */
struct MatMatCusparse {
  PetscBool cisdense;     /* caller's C was MATSEQDENSE: convert back to host dense after computing on the GPU */
  PetscScalar *Bt;        /* buffer for explicit B^T (CUDA < 11 csrmm cannot transpose B) */
  Mat X;                  /* intermediate dense product for RARt/PtAP */
  PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix *Bcsr;        /* uncompressed view of B when B uses compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;         /* kept alive for cusparseSpGEMMreuse_compute */
  void *dBuffer5;
#endif
  size_t mmBufferSize;
  void *mmBuffer;
  void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse product data: frees GPU buffers, cusparse descriptors and the intermediate matrix X */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors may never have been created (e.g. empty matrices), hence the guards */
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc) { stat =
cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
#endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A)*op(B) with A sparse (AIJCUSPARSE) and B dense (DENSECUDA);
   also handles PtAP/RARt by computing the sparse-dense product into X and finishing with a dense-dense product */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product *product = C->product;
  Mat A,B;
  PetscInt m,n,blda,clda;
  PetscBool flg,biscuda;
  Mat_SeqAIJCUSPARSE *cusp;
  cusparseStatus_t stat;
  cusparseOperation_t opA;
  const PetscScalar *barray;
  PetscScalar *carray;
  PetscErrorCode ierr;
  MatMatCusparse *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr
= MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) (possibly the stored explicit transpose) and the dimensions of the sparse-dense product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for RARt/PtAP the sparse-dense product goes into the intermediate X, not directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt ||
product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size exceeds what is already allocated */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated Bt buffer with a cublas geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish RARt/PtAP with a dense-dense product against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary conversions done at entry */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase of the sparse(AIJCUSPARSE)-dense(DENSECUDA) products: sets sizes/type of C and allocates product data */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product *product = C->product;
  Mat A,B;
  PetscInt m,n;
  PetscBool cisdense,flg;
  PetscErrorCode ierr;
  MatMatCusparse *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  switch
(product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase of the sparse-sparse product C = op(A)*op(B) with all operands AIJCUSPARSE (SpGEMM) */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product *product = C->product;
  Mat A,B;
  Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix *Acsr,*Bcsr,*Ccsr;
  PetscBool flg;
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cudaError_t cerr;
  MatProductType ptype;
  MatMatCusparse *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* nothing to compute for an empty product */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetry lets AtB/ABt be computed as AB; the symbolic phase must have made the same choice */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the stored explicit-transpose mult structs */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* structure was fixed by the symbolic phase; SpGEMMreuse only recomputes the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of the AIJCUSPARSE sparse-sparse product: determines the sparsity pattern of C on the GPU */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product *product = C->product;
  Mat A,B;
  Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix *Acsr,*Bcsr,*Ccsr;
  PetscInt i,j,m,n,k;
  PetscBool flg;
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cudaError_t cerr;
  MatProductType ptype;
  MatMatCusparse *mmdata;
  PetscLogDouble flops;
  PetscBool biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
C->product->data = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: AtB/ABt reduce to AB; record the choice so numeric can verify it */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows = 0;
    c->compressedrow.i = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Cmat->cprowIndices = NULL;
  }
  Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows = Ccusp->nrows;
  Ccsr->num_cols = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalars for the device pointer mode used by the SpGEMM calls */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a full-row view of B that shares the column indices and values of the compressed storage */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows = B->rmap->n;
    Bcsr->num_cols = cBcsr->num_cols;
    Bcsr->num_entries = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2661 /* precompute flops count */ 2662 if (ptype == MATPRODUCT_AB) { 2663 for (i=0, flops = 0; i<A->rmap->n; i++) { 2664 const PetscInt st = a->i[i]; 2665 const PetscInt en = a->i[i+1]; 2666 for (j=st; j<en; j++) { 2667 const PetscInt brow = a->j[j]; 2668 flops += 2.*(b->i[brow+1] - b->i[brow]); 2669 } 2670 } 2671 } else if (ptype == MATPRODUCT_AtB) { 2672 for (i=0, flops = 0; i<A->rmap->n; i++) { 2673 const PetscInt anzi = a->i[i+1] - a->i[i]; 2674 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2675 flops += (2.*anzi)*bnzi; 2676 } 2677 } else { /* TODO */ 2678 flops = 0.; 2679 } 2680 2681 mmdata->flops = flops; 2682 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2683 2684 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2685 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2686 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2687 NULL, NULL, NULL, 2688 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2689 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2690 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2691 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2692 { 2693 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2694 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2695 */ 2696 void* dBuffer1 = NULL; 2697 void* dBuffer2 = NULL; 2698 void* dBuffer3 = NULL; 2699 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2700 size_t bufferSize1 = 0; 2701 size_t bufferSize2 = 0; 2702 size_t bufferSize3 = 0; 2703 size_t bufferSize4 = 0; 2704 size_t bufferSize5 = 0; 2705 2706 /*----------------------------------------------------------------------*/ 2707 /* ask bufferSize1 bytes for external memory */ 2708 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2709 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2710 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2711 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2712 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2713 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2714 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2715 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2716 2717 /*----------------------------------------------------------------------*/ 2718 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2719 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2720 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2721 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2722 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2723 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2724 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2725 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2726 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, 
mmdata->dBuffer4);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
    cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
    ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries,
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device CSR into the host (SeqAIJ) representation of C */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii =
*Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    /* remaining (empty) rows all end at the final offset */
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask =
PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only use the GPU backend when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr =
PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr =
PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i < n; bounds-checked 1D launch */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: op(A) x = 0, so z is 0 or a copy of y */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else     {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ?
PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else          {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ?
matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV cache below, so guard against ABI changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized =
PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else          {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
} catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* if assembly changed the nonzero pattern, the cached device matrix is stale */
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* remove all composed methods before falling through to the SeqAIJ destructor */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr =
PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3361 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 3362 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3363 PetscFunctionReturn(0); 3364 } 3365 3366 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3367 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3368 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3369 { 3370 PetscErrorCode ierr; 3371 3372 PetscFunctionBegin; 3373 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3374 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3375 PetscFunctionReturn(0); 3376 } 3377 3378 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3379 { 3380 PetscErrorCode ierr; 3381 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3382 Mat_SeqAIJCUSPARSE *cy; 3383 Mat_SeqAIJCUSPARSE *cx; 3384 PetscScalar *ay; 3385 const PetscScalar *ax; 3386 CsrMatrix *csry,*csrx; 3387 3388 PetscFunctionBegin; 3389 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3390 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3391 if (X->ops->axpy != Y->ops->axpy) { 3392 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3393 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3394 PetscFunctionReturn(0); 3395 } 3396 /* if we are here, it means both matrices are bound to GPU */ 3397 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3398 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3399 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3400 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3401 csry = (CsrMatrix*)cy->mat->mat; 3402 csrx = (CsrMatrix*)cx->mat->mat; 3403 /* see if 
we can turn this into a cublas axpy */ 3404 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3405 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3406 if (eq) { 3407 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3408 } 3409 if (eq) str = SAME_NONZERO_PATTERN; 3410 } 3411 /* spgeam is buggy with one column */ 3412 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3413 3414 if (str == SUBSET_NONZERO_PATTERN) { 3415 cusparseStatus_t stat; 3416 PetscScalar b = 1.0; 3417 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3418 size_t bufferSize; 3419 void *buffer; 3420 cudaError_t cerr; 3421 #endif 3422 3423 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3424 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3425 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3426 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3427 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3428 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3429 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3430 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3431 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3432 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3433 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3434 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3435 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3436 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 
3437 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3438 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3439 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3440 #else 3441 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3442 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3443 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3444 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3445 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3446 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3447 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3448 #endif 3449 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3450 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3451 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3452 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3453 } else if (str == SAME_NONZERO_PATTERN) { 3454 cublasHandle_t cublasv2handle; 3455 cublasStatus_t berr; 3456 PetscBLASInt one = 1, bnz = 1; 3457 3458 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3459 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3460 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3461 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3462 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3463 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3464 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3465 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3466 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3467 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3468 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3469 } else { 3470 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3471 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3472 } 3473 
PetscFunctionReturn(0); 3474 } 3475 3476 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3477 { 3478 PetscErrorCode ierr; 3479 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3480 PetscScalar *ay; 3481 cublasHandle_t cublasv2handle; 3482 cublasStatus_t berr; 3483 PetscBLASInt one = 1, bnz = 1; 3484 3485 PetscFunctionBegin; 3486 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3487 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3488 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3489 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3490 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3491 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3492 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3493 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3494 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3495 PetscFunctionReturn(0); 3496 } 3497 3498 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3499 { 3500 PetscErrorCode ierr; 3501 PetscBool both = PETSC_FALSE; 3502 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3503 3504 PetscFunctionBegin; 3505 if (A->factortype == MAT_FACTOR_NONE) { 3506 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3507 if (spptr->mat) { 3508 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3509 if (matrix->values) { 3510 both = PETSC_TRUE; 3511 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3512 } 3513 } 3514 if (spptr->matTranspose) { 3515 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3516 if (matrix->values) { 3517 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3518 } 3519 } 3520 } 3521 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3522 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3523 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3524 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3525 else A->offloadmask = PETSC_OFFLOAD_CPU; 3526 PetscFunctionReturn(0); 3527 } 3528 3529 static 
PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3530 { 3531 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3532 PetscErrorCode ierr; 3533 3534 PetscFunctionBegin; 3535 if (A->factortype != MAT_FACTOR_NONE) { 3536 A->boundtocpu = flg; 3537 PetscFunctionReturn(0); 3538 } 3539 if (flg) { 3540 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3541 3542 A->ops->scale = MatScale_SeqAIJ; 3543 A->ops->axpy = MatAXPY_SeqAIJ; 3544 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3545 A->ops->mult = MatMult_SeqAIJ; 3546 A->ops->multadd = MatMultAdd_SeqAIJ; 3547 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3548 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3549 A->ops->multhermitiantranspose = NULL; 3550 A->ops->multhermitiantransposeadd = NULL; 3551 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3552 ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3553 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3554 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3555 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3556 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3557 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3558 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3559 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3560 } else { 3561 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3562 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3563 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3564 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3565 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3566 A->ops->multtranspose = 
MatMultTranspose_SeqAIJCUSPARSE; 3567 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3568 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3569 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3570 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3571 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3572 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3573 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3574 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3575 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3576 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3577 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3578 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3579 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3580 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3581 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3582 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3583 } 3584 A->boundtocpu = flg; 3585 if (flg && a->inode.size) { 3586 a->inode.use = PETSC_TRUE; 3587 } else { 3588 a->inode.use = PETSC_FALSE; 3589 } 3590 PetscFunctionReturn(0); 3591 } 3592 3593 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3594 { 3595 PetscErrorCode ierr; 3596 
cusparseStatus_t stat; 3597 Mat B; 3598 3599 PetscFunctionBegin; 3600 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3601 if (reuse == MAT_INITIAL_MATRIX) { 3602 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3603 } else if (reuse == MAT_REUSE_MATRIX) { 3604 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3605 } 3606 B = *newmat; 3607 3608 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3609 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3610 3611 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3612 if (B->factortype == MAT_FACTOR_NONE) { 3613 Mat_SeqAIJCUSPARSE *spptr; 3614 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3615 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3616 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3617 spptr->format = MAT_CUSPARSE_CSR; 3618 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3619 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3620 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3621 #else 3622 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3623 #endif 3624 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3625 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3626 #endif 3627 B->spptr = spptr; 3628 } else { 3629 Mat_SeqAIJCUSPARSETriFactors *spptr; 3630 3631 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3632 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3633 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3634 B->spptr = spptr; 3635 } 3636 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3637 } 3638 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3639 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3640 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3641 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3642 B->ops->bindtocpu = 
MatBindToCPU_SeqAIJCUSPARSE; 3643 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3644 3645 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3646 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3647 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3648 #if defined(PETSC_HAVE_HYPRE) 3649 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3650 #endif 3651 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr); 3652 PetscFunctionReturn(0); 3653 } 3654 3655 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3656 { 3657 PetscErrorCode ierr; 3658 3659 PetscFunctionBegin; 3660 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3661 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3662 PetscFunctionReturn(0); 3663 } 3664 3665 /*MC 3666 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3667 3668 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3669 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3670 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3671 3672 Options Database Keys: 3673 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3674 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3675 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3676 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3677 3678 Level: beginner 3679 3680 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3681 M*/ 3682 3683 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3684 3685 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3686 { 3687 PetscErrorCode ierr; 3688 3689 PetscFunctionBegin; 3690 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3691 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3692 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3693 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3694 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3695 3696 PetscFunctionReturn(0); 3697 } 3698 3699 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3700 { 3701 PetscErrorCode ierr; 3702 cusparseStatus_t stat; 3703 3704 PetscFunctionBegin; 3705 if (*cusparsestruct) { 3706 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3707 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3708 delete (*cusparsestruct)->workVector; 3709 delete (*cusparsestruct)->rowoffsets_gpu; 3710 delete (*cusparsestruct)->cooPerm; 3711 delete (*cusparsestruct)->cooPerm_a; 3712 delete (*cusparsestruct)->csr2csc_i; 3713 if ((*cusparsestruct)->handle) {stat = 
cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3714 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3715 } 3716 PetscFunctionReturn(0); 3717 } 3718 3719 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3720 { 3721 PetscFunctionBegin; 3722 if (*mat) { 3723 delete (*mat)->values; 3724 delete (*mat)->column_indices; 3725 delete (*mat)->row_offsets; 3726 delete *mat; 3727 *mat = 0; 3728 } 3729 PetscFunctionReturn(0); 3730 } 3731 3732 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3733 { 3734 cusparseStatus_t stat; 3735 PetscErrorCode ierr; 3736 3737 PetscFunctionBegin; 3738 if (*trifactor) { 3739 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3740 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3741 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3742 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3743 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3744 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3745 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3746 #endif 3747 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3748 } 3749 PetscFunctionReturn(0); 3750 } 3751 3752 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3753 { 3754 CsrMatrix *mat; 3755 cusparseStatus_t stat; 3756 cudaError_t err; 3757 3758 PetscFunctionBegin; 3759 if (*matstruct) { 3760 if ((*matstruct)->mat) { 3761 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3762 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3763 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3764 #else 3765 
cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3766 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3767 #endif 3768 } else { 3769 mat = (CsrMatrix*)(*matstruct)->mat; 3770 CsrMatrix_Destroy(&mat); 3771 } 3772 } 3773 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3774 delete (*matstruct)->cprowIndices; 3775 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3776 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3777 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3778 3779 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3780 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3781 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3782 for (int i=0; i<3; i++) { 3783 if (mdata->cuSpMV[i].initialized) { 3784 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3785 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3786 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3787 } 3788 } 3789 #endif 3790 delete *matstruct; 3791 *matstruct = NULL; 3792 } 3793 PetscFunctionReturn(0); 3794 } 3795 3796 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3797 { 3798 PetscErrorCode ierr; 3799 3800 PetscFunctionBegin; 3801 if (*trifactors) { 3802 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3803 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3804 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3805 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3806 delete (*trifactors)->rpermIndices; 3807 delete (*trifactors)->cpermIndices; 3808 delete (*trifactors)->workVector; 3809 
(*trifactors)->rpermIndices = NULL; 3810 (*trifactors)->cpermIndices = NULL; 3811 (*trifactors)->workVector = NULL; 3812 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3813 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3814 (*trifactors)->init_dev_prop = PETSC_FALSE; 3815 } 3816 PetscFunctionReturn(0); 3817 } 3818 3819 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3820 { 3821 PetscErrorCode ierr; 3822 cusparseHandle_t handle; 3823 cusparseStatus_t stat; 3824 3825 PetscFunctionBegin; 3826 if (*trifactors) { 3827 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3828 if (handle = (*trifactors)->handle) { 3829 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3830 } 3831 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3832 } 3833 PetscFunctionReturn(0); 3834 } 3835 3836 struct IJCompare 3837 { 3838 __host__ __device__ 3839 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3840 { 3841 if (t1.get<0>() < t2.get<0>()) return true; 3842 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3843 return false; 3844 } 3845 }; 3846 3847 struct IJEqual 3848 { 3849 __host__ __device__ 3850 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3851 { 3852 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3853 return true; 3854 } 3855 }; 3856 3857 struct IJDiff 3858 { 3859 __host__ __device__ 3860 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3861 { 3862 return t1 == t2 ? 
0 : 1; 3863 } 3864 }; 3865 3866 struct IJSum 3867 { 3868 __host__ __device__ 3869 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3870 { 3871 return t1||t2; 3872 } 3873 }; 3874 3875 #include <thrust/iterator/discard_iterator.h> 3876 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3877 { 3878 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3879 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3880 THRUSTARRAY *cooPerm_v = NULL; 3881 thrust::device_ptr<const PetscScalar> d_v; 3882 CsrMatrix *matrix; 3883 PetscErrorCode ierr; 3884 PetscInt n; 3885 3886 PetscFunctionBegin; 3887 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3888 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3889 if (!cusp->cooPerm) { 3890 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3891 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3892 PetscFunctionReturn(0); 3893 } 3894 matrix = (CsrMatrix*)cusp->mat->mat; 3895 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3896 if (!v) { 3897 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3898 goto finalize; 3899 } 3900 n = cusp->cooPerm->size(); 3901 if (isCudaMem(v)) { 3902 d_v = thrust::device_pointer_cast(v); 3903 } else { 3904 cooPerm_v = new THRUSTARRAY(n); 3905 cooPerm_v->assign(v,v+n); 3906 d_v = cooPerm_v->data(); 3907 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3908 } 3909 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3910 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3911 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3912 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3913 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3914 /* 
thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3915 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3916 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3917 */ 3918 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3919 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3920 delete cooPerm_w; 3921 } else { 3922 /* all nonzeros in d_v[] are unique entries */ 3923 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3924 matrix->values->begin())); 3925 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3926 matrix->values->end())); 3927 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3928 } 3929 } else { 3930 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3931 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3932 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3933 } else { 3934 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3935 matrix->values->begin())); 3936 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3937 matrix->values->end())); 3938 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3939 } 3940 } 3941 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3942 finalize: 3943 delete cooPerm_v; 3944 A->offloadmask = PETSC_OFFLOAD_GPU; 3945 ierr = 
PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3946 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3947 ierr = PetscInfo3(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3948 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3949 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr); 3950 a->reallocs = 0; 3951 A->info.mallocs += 0; 3952 A->info.nz_unneeded = 0; 3953 A->assembled = A->was_assembled = PETSC_TRUE; 3954 A->num_ass++; 3955 PetscFunctionReturn(0); 3956 } 3957 3958 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3959 { 3960 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3961 PetscErrorCode ierr; 3962 3963 PetscFunctionBegin; 3964 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3965 if (!cusp) PetscFunctionReturn(0); 3966 if (destroy) { 3967 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3968 delete cusp->csr2csc_i; 3969 cusp->csr2csc_i = NULL; 3970 } 3971 A->transupdated = PETSC_FALSE; 3972 PetscFunctionReturn(0); 3973 } 3974 3975 #include <thrust/binary_search.h> 3976 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3977 { 3978 PetscErrorCode ierr; 3979 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3980 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3981 PetscInt cooPerm_n, nzr = 0; 3982 cudaError_t cerr; 3983 3984 PetscFunctionBegin; 3985 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3986 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3987 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3988 if (n != cooPerm_n) { 3989 delete cusp->cooPerm; 3990 delete cusp->cooPerm_a; 3991 cusp->cooPerm = NULL; 3992 cusp->cooPerm_a = NULL; 3993 } 3994 if (n) { 3995 THRUSTINTARRAY d_i(n); 3996 THRUSTINTARRAY d_j(n); 3997 THRUSTINTARRAY ii(A->rmap->n); 3998 3999 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 4000 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 4001 4002 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 4003 d_i.assign(coo_i,coo_i+n); 4004 d_j.assign(coo_j,coo_j+n); 4005 4006 /* Ex. 4007 n = 6 4008 coo_i = [3,3,1,4,1,4] 4009 coo_j = [3,2,2,5,2,6] 4010 */ 4011 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 4012 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 4013 4014 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4015 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4016 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4017 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 4018 THRUSTINTARRAY w = d_j; 4019 4020 /* 4021 d_i = [1,1,3,3,4,4] 4022 d_j = [2,2,2,3,5,6] 4023 cooPerm = [2,4,1,0,3,5] 4024 */ 4025 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4026 4027 /* 4028 d_i = [1,3,3,4,4,x] 4029 ^ekey 4030 d_j = [2,2,3,5,6,x] 4031 ^nekye 4032 */ 4033 if (nekey == ekey) { /* all entries are unique */ 4034 delete cusp->cooPerm_a; 4035 cusp->cooPerm_a = NULL; 4036 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4037 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4038 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4039 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4040 (*cusp->cooPerm_a)[0] = 0; 
/* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4041 w[0] = 0; 4042 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4043 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4044 } 4045 thrust::counting_iterator<PetscInt> search_begin(0); 4046 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4047 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4048 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4049 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4050 4051 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 4052 a->singlemalloc = PETSC_FALSE; 4053 a->free_a = PETSC_TRUE; 4054 a->free_ij = PETSC_TRUE; 4055 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 4056 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4057 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4058 a->nz = a->maxnz = a->i[A->rmap->n]; 4059 a->rmax = 0; 4060 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 4061 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 4062 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4063 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 4064 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 4065 for (PetscInt i = 0; i < A->rmap->n; i++) { 4066 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4067 nzr += (PetscInt)!!(nnzr); 4068 a->ilen[i] = a->imax[i] = nnzr; 4069 a->rmax = PetscMax(a->rmax,nnzr); 4070 } 4071 a->nonzerorowcnt = 
nzr; 4072 A->preallocated = PETSC_TRUE; 4073 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4074 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 4075 } else { 4076 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 4077 } 4078 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 4079 4080 /* We want to allocate the CUSPARSE struct for matvec now. 4081 The code is so convoluted now that I prefer to copy zeros */ 4082 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 4083 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 4084 A->offloadmask = PETSC_OFFLOAD_CPU; 4085 A->nonzerostate++; 4086 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4087 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 4088 4089 A->assembled = PETSC_FALSE; 4090 A->was_assembled = PETSC_FALSE; 4091 PetscFunctionReturn(0); 4092 } 4093 4094 /*@C 4095 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* both outputs must be requested; if either is NULL there is nothing to do */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the device CSR is current before handing out its arrays */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily upload the full (uncompressed) row offsets a->i from the host */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - 
the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* no device state to release; just invalidate the caller's borrowed pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* read access: ensure the device values mirror the host copy first */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state change on the matrix, just invalidate the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* read-write: device values must be current before the caller mutates them */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller may write: the device copy becomes the authoritative one */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* cached transpose values become stale once values may change */
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have changed through the returned pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  
a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no MatSeqAIJCUSPARSECopyToGPU() here since existing values will be overwritten */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy becomes the authoritative one */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values were (re)written through the returned pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Strict weak ordering on (row,col,val,perm) tuples: sort by row, ties broken by column.
   Used below to merge the COO representations of two matrices */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Unary functor adding a fixed offset to an index; used below to shift B's column
   indices (and transpose row offsets) when concatenating with A */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */ 4385 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4386 { 4387 PetscErrorCode ierr; 4388 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4389 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4390 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4391 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4392 PetscInt Annz,Bnnz; 4393 cusparseStatus_t stat; 4394 PetscInt i,m,n,zero = 0; 4395 cudaError_t cerr; 4396 4397 PetscFunctionBegin; 4398 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4399 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4400 PetscValidPointer(C,4); 4401 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4402 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4403 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 4404 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4405 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4406 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4407 if (reuse == MAT_INITIAL_MATRIX) { 4408 m = A->rmap->n; 4409 n = A->cmap->n + B->cmap->n; 4410 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4411 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4412 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4413 c = (Mat_SeqAIJ*)(*C)->data; 4414 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4415 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4416 Ccsr = new CsrMatrix; 4417 Cmat->cprowIndices = NULL; 4418 c->compressedrow.use = PETSC_FALSE; 4419 c->compressedrow.nrows = 0; 4420 c->compressedrow.i = NULL; 4421 c->compressedrow.rindex = NULL; 4422 Ccusp->workVector = NULL; 4423 Ccusp->nrows = m; 4424 Ccusp->mat = Cmat; 4425 
Ccusp->mat->mat = Ccsr; 4426 Ccsr->num_rows = m; 4427 Ccsr->num_cols = n; 4428 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4429 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4430 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4431 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4432 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4433 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4434 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4435 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4436 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4437 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4438 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4439 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4440 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4441 4442 Acsr = (CsrMatrix*)Acusp->mat->mat; 4443 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4444 Annz = (PetscInt)Acsr->column_indices->size(); 4445 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4446 c->nz = Annz + Bnnz; 4447 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4448 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4449 Ccsr->values = new THRUSTARRAY(c->nz); 4450 Ccsr->num_entries = c->nz; 4451 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4452 if (c->nz) { 4453 auto Acoo = new THRUSTINTARRAY32(Annz); 4454 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4455 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4456 THRUSTINTARRAY32 *Aroff,*Broff; 4457 4458 if (a->compressedrow.use) { /* need full row offset */ 
4459 if (!Acusp->rowoffsets_gpu) { 4460 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4461 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4462 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4463 } 4464 Aroff = Acusp->rowoffsets_gpu; 4465 } else Aroff = Acsr->row_offsets; 4466 if (b->compressedrow.use) { /* need full row offset */ 4467 if (!Bcusp->rowoffsets_gpu) { 4468 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4469 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4470 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4471 } 4472 Broff = Bcusp->rowoffsets_gpu; 4473 } else Broff = Bcsr->row_offsets; 4474 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4475 stat = cusparseXcsr2coo(Acusp->handle, 4476 Aroff->data().get(), 4477 Annz, 4478 m, 4479 Acoo->data().get(), 4480 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4481 stat = cusparseXcsr2coo(Bcusp->handle, 4482 Broff->data().get(), 4483 Bnnz, 4484 m, 4485 Bcoo->data().get(), 4486 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4487 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4488 auto Aperm = thrust::make_constant_iterator(1); 4489 auto Bperm = thrust::make_constant_iterator(0); 4490 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4491 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4492 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4493 #else 4494 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4495 auto Bcib = Bcsr->column_indices->begin(); 4496 auto Bcie = Bcsr->column_indices->end(); 4497 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4498 #endif 4499 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4500 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4501 auto 
Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4502 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4503 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4504 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4505 auto p1 = Ccusp->cooPerm->begin(); 4506 auto p2 = Ccusp->cooPerm->begin(); 4507 thrust::advance(p2,Annz); 4508 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4509 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4510 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4511 #endif 4512 auto cci = thrust::make_counting_iterator(zero); 4513 auto cce = thrust::make_counting_iterator(c->nz); 4514 #if 0 //Errors on SUMMIT cuda 11.1.0 4515 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4516 #else 4517 auto pred = thrust::identity<int>(); 4518 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4519 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4520 #endif 4521 stat = cusparseXcoo2csr(Ccusp->handle, 4522 Ccoo->data().get(), 4523 c->nz, 4524 m, 4525 Ccsr->row_offsets->data().get(), 4526 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4527 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4528 delete wPerm; 4529 delete Acoo; 4530 delete Bcoo; 4531 delete Ccoo; 4532 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4533 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4534 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4535 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4536 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4537 #endif 
4538 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4539 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 4540 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4541 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4542 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4543 CsrMatrix *CcsrT = new CsrMatrix; 4544 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4545 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4546 4547 (*C)->form_explicit_transpose = PETSC_TRUE; 4548 (*C)->transupdated = PETSC_TRUE; 4549 Ccusp->rowoffsets_gpu = NULL; 4550 CmatT->cprowIndices = NULL; 4551 CmatT->mat = CcsrT; 4552 CcsrT->num_rows = n; 4553 CcsrT->num_cols = m; 4554 CcsrT->num_entries = c->nz; 4555 4556 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4557 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4558 CcsrT->values = new THRUSTARRAY(c->nz); 4559 4560 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4561 auto rT = CcsrT->row_offsets->begin(); 4562 if (AT) { 4563 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4564 thrust::advance(rT,-1); 4565 } 4566 if (BT) { 4567 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4568 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4569 thrust::copy(titb,tite,rT); 4570 } 4571 auto cT = CcsrT->column_indices->begin(); 4572 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4573 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4574 auto vT = CcsrT->values->begin(); 4575 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4576 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4577 ierr = 
PetscLogGpuTimeEnd();CHKERRQ(ierr); 4578 4579 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4580 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4581 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4582 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4583 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4584 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4585 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4586 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4587 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4588 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4589 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4590 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4591 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4592 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4593 #endif 4594 Ccusp->matTranspose = CmatT; 4595 } 4596 } 4597 4598 c->singlemalloc = PETSC_FALSE; 4599 c->free_a = PETSC_TRUE; 4600 c->free_ij = PETSC_TRUE; 4601 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4602 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4603 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4604 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4605 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4606 ii = *Ccsr->row_offsets; 4607 jj = *Ccsr->column_indices; 4608 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4609 
cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4610 } else { 4611 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4612 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4613 } 4614 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4615 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4616 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4617 c->maxnz = c->nz; 4618 c->nonzerorowcnt = 0; 4619 c->rmax = 0; 4620 for (i = 0; i < m; i++) { 4621 const PetscInt nn = c->i[i+1] - c->i[i]; 4622 c->ilen[i] = c->imax[i] = nn; 4623 c->nonzerorowcnt += (PetscInt)!!nn; 4624 c->rmax = PetscMax(c->rmax,nn); 4625 } 4626 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4627 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4628 (*C)->nonzerostate++; 4629 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4630 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4631 Ccusp->nonzerostate = (*C)->nonzerostate; 4632 (*C)->preallocated = PETSC_TRUE; 4633 } else { 4634 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4635 c = (Mat_SeqAIJ*)(*C)->data; 4636 if (c->nz) { 4637 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4638 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4639 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4640 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4641 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4642 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 
4643 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4644 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4645 Acsr = (CsrMatrix*)Acusp->mat->mat; 4646 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4647 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4648 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 4649 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4650 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4651 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4652 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4653 auto pmid = Ccusp->cooPerm->begin(); 4654 thrust::advance(pmid,Acsr->num_entries); 4655 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4656 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4657 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4658 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4659 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4660 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4661 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4662 
thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4663 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4664 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4665 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4666 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4667 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4668 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4669 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4670 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4671 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4672 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4673 auto vT = CcsrT->values->begin(); 4674 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4675 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4676 (*C)->transupdated = PETSC_TRUE; 4677 } 4678 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4679 } 4680 } 4681 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4682 (*C)->assembled = PETSC_TRUE; 4683 (*C)->was_assembled = PETSC_FALSE; 4684 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4685 PetscFunctionReturn(0); 4686 } 4687 4688 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4689 { 4690 PetscErrorCode ierr; 4691 bool dmem; 4692 const PetscScalar *av; 4693 cudaError_t cerr; 4694 4695 PetscFunctionBegin; 4696 dmem = isCudaMem(v); 4697 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4698 if (n && idx) { 4699 THRUSTINTARRAY widx(n); 4700 widx.assign(idx,idx+n); 4701 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4702 4703 THRUSTARRAY *w = NULL; 4704 
thrust::device_ptr<PetscScalar> dv; 4705 if (dmem) { 4706 dv = thrust::device_pointer_cast(v); 4707 } else { 4708 w = new THRUSTARRAY(n); 4709 dv = w->data(); 4710 } 4711 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4712 4713 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4714 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4715 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4716 if (w) { 4717 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4718 } 4719 delete w; 4720 } else { 4721 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4722 } 4723 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4724 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4725 PetscFunctionReturn(0); 4726 } 4727