1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/adjacent_difference.h> 16 #include <thrust/async/for_each.h> 17 #include <thrust/iterator/constant_iterator.h> 18 #include <thrust/remove.h> 19 #include <thrust/sort.h> 20 #include <thrust/unique.h> 21 22 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 26 27 typedef enum { 28 CUSPARSE_MV_ALG_DEFAULT = 0, 29 CUSPARSE_COOMV_ALG = 1, 30 CUSPARSE_CSRMV_ALG1 = 2, 31 CUSPARSE_CSRMV_ALG2 = 3 32 } cusparseSpMVAlg_t; 33 34 typedef enum { 35 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40 CUSPARSE_SPMM_ALG_DEFAULT = 0, 41 CUSPARSE_SPMM_COO_ALG1 = 1, 42 CUSPARSE_SPMM_COO_ALG2 = 2, 43 CUSPARSE_SPMM_COO_ALG3 = 3, 44 CUSPARSE_SPMM_COO_ALG4 = 5, 45 CUSPARSE_SPMM_CSR_ALG1 = 4, 46 CUSPARSE_SPMM_CSR_ALG2 = 6, 47 } cusparseSpMMAlg_t; 48 49 typedef enum { 50 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52 } cusparseCsr2CscAlg_t; 53 */ 54 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 56 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! 
We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57 #endif 58 59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62 63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66 67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 81 82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 87 88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 90 91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92 93 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 94 { 95 cusparseStatus_t stat; 96 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 97 98 PetscFunctionBegin; 99 PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 100 cusparsestruct->stream = stream; 101 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 102 PetscFunctionReturn(0); 103 } 104 105 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 106 { 107 cusparseStatus_t stat; 108 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 109 110 PetscFunctionBegin; 111 PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 112 if (cusparsestruct->handle != handle) { 113 if (cusparsestruct->handle) { 114 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 115 } 116 cusparsestruct->handle = handle; 117 } 118 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 119 
PetscFunctionReturn(0); 120 } 121 122 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 123 { 124 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 125 PetscBool flg; 126 PetscErrorCode ierr; 127 128 PetscFunctionBegin; 129 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 130 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 131 if (cusparsestruct->handle) cusparsestruct->handle = 0; 132 PetscFunctionReturn(0); 133 } 134 135 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 136 { 137 PetscFunctionBegin; 138 *type = MATSOLVERCUSPARSE; 139 PetscFunctionReturn(0); 140 } 141 142 /*MC 143 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 144 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 145 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 146 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 147 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 148 algorithms are not recommended. This class does NOT support direct solver operations. 149 150 Level: beginner 151 152 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 153 M*/ 154 155 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 156 { 157 PetscErrorCode ierr; 158 PetscInt n = A->rmap->n; 159 160 PetscFunctionBegin; 161 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 162 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 163 (*B)->factortype = ftype; 164 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 165 166 if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); } 167 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 168 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 169 if (!A->boundtocpu) { 170 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 171 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 172 } else { 173 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 174 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 175 } 176 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 177 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 178 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 179 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 180 if (!A->boundtocpu) { 181 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 182 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 183 } else { 184 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 185 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 186 } 187 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 188 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 189 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 190 191 ierr = 
MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 192 (*B)->canuseordering = PETSC_TRUE; 193 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 194 PetscFunctionReturn(0); 195 } 196 197 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 198 { 199 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 200 201 PetscFunctionBegin; 202 switch (op) { 203 case MAT_CUSPARSE_MULT: 204 cusparsestruct->format = format; 205 break; 206 case MAT_CUSPARSE_ALL: 207 cusparsestruct->format = format; 208 break; 209 default: 210 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 211 } 212 PetscFunctionReturn(0); 213 } 214 215 /*@ 216 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 217 operation. Only the MatMult operation can use different GPU storage formats 218 for MPIAIJCUSPARSE matrices. 219 Not Collective 220 221 Input Parameters: 222 + A - Matrix of type SEQAIJCUSPARSE 223 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 224 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 225 226 Output Parameter: 227 228 Level: intermediate 229 230 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 231 @*/ 232 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 233 { 234 PetscErrorCode ierr; 235 236 PetscFunctionBegin; 237 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 238 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 243 { 244 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 245 246 PetscFunctionBegin; 247 cusparsestruct->use_cpu_solve = use_cpu; 248 PetscFunctionReturn(0); 249 } 250 251 /*@ 252 MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 253 254 Input Parameters: 255 + A - Matrix of type SEQAIJCUSPARSE 256 - use_cpu - set flag for using the built-in CPU MatSolve 257 258 Output Parameter: 259 260 Notes: 261 The cuSparse LU solver currently computes the factors with the built-in CPU method 262 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 263 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 
264 265 Level: intermediate 266 267 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 268 @*/ 269 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 270 { 271 PetscErrorCode ierr; 272 273 PetscFunctionBegin; 274 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 275 ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr); 276 PetscFunctionReturn(0); 277 } 278 279 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 280 { 281 PetscErrorCode ierr; 282 283 PetscFunctionBegin; 284 switch (op) { 285 case MAT_FORM_EXPLICIT_TRANSPOSE: 286 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 287 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 288 A->form_explicit_transpose = flg; 289 break; 290 default: 291 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 292 break; 293 } 294 PetscFunctionReturn(0); 295 } 296 297 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 298 299 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 300 { 301 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 302 IS isrow = b->row,iscol = b->col; 303 PetscBool row_identity,col_identity; 304 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 305 PetscErrorCode ierr; 306 307 PetscFunctionBegin; 308 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 309 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 310 B->offloadmask = PETSC_OFFLOAD_CPU; 311 /* determine which version of MatSolve needs to be used. */ 312 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 313 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 314 if (row_identity && col_identity) { 315 if (!cusparsestruct->use_cpu_solve) { 316 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 317 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 318 } 319 B->ops->matsolve = NULL; 320 B->ops->matsolvetranspose = NULL; 321 } else { 322 if (!cusparsestruct->use_cpu_solve) { 323 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 324 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 325 } 326 B->ops->matsolve = NULL; 327 B->ops->matsolvetranspose = NULL; 328 } 329 330 /* get the triangular factors */ 331 if (!cusparsestruct->use_cpu_solve) { 332 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 333 } 334 PetscFunctionReturn(0); 335 } 336 337 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 338 { 339 PetscErrorCode ierr; 340 MatCUSPARSEStorageFormat format; 341 PetscBool flg; 342 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 343 344 PetscFunctionBegin; 345 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 346 if (A->factortype == MAT_FACTOR_NONE) { 347 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 348 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 349 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 350 351 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 352 
"MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 353 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 354 ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr); 355 if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);} 356 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 357 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 358 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 359 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 360 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 361 PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 362 #else 363 PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 364 #endif 365 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 366 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 367 PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 368 369 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 370 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 371 PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 372 #endif 373 } 374 ierr = PetscOptionsTail();CHKERRQ(ierr); 375 PetscFunctionReturn(0); 376 } 377 378 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 379 { 380 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 381 PetscErrorCode ierr; 382 383 PetscFunctionBegin; 384 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 385 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 386 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 387 PetscFunctionReturn(0); 388 } 389 390 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 391 { 392 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 393 PetscErrorCode ierr; 394 395 PetscFunctionBegin; 396 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 397 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 398 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 399 PetscFunctionReturn(0); 400 } 401 402 static PetscErrorCode 
MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 403 { 404 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 405 PetscErrorCode ierr; 406 407 PetscFunctionBegin; 408 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 409 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 410 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 411 PetscFunctionReturn(0); 412 } 413 414 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 415 { 416 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 417 PetscErrorCode ierr; 418 419 PetscFunctionBegin; 420 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 421 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 422 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 423 PetscFunctionReturn(0); 424 } 425 426 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 427 { 428 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 429 PetscInt n = A->rmap->n; 430 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 431 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 432 cusparseStatus_t stat; 433 const PetscInt *ai = a->i,*aj = a->j,*vi; 434 const MatScalar *aa = a->a,*v; 435 PetscInt *AiLo, *AjLo; 436 PetscInt i,nz, nzLower, offset, rowOffset; 437 PetscErrorCode ierr; 438 cudaError_t cerr; 439 440 PetscFunctionBegin; 441 if (!n) PetscFunctionReturn(0); 442 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 443 try { 444 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 445 nzLower=n+ai[n]-ai[1]; 446 if (!loTriFactor) { 447 PetscScalar *AALo; 448 449 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 450 451 /* Allocate Space for the lower triangular matrix */ 452 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 453 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 454 455 /* Fill the lower triangular matrix */ 456 AiLo[0] = (PetscInt) 0; 457 AiLo[n] = nzLower; 458 AjLo[0] = (PetscInt) 0; 459 AALo[0] = (MatScalar) 1.0; 460 v = aa; 461 vi = aj; 462 offset = 1; 463 rowOffset= 1; 464 for (i=1; i<n; i++) { 465 nz = ai[i+1] - ai[i]; 466 /* additional 1 for the term on the diagonal */ 467 AiLo[i] = rowOffset; 468 rowOffset += nz+1; 469 470 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 471 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 472 473 offset += nz; 474 AjLo[offset] = (PetscInt) i; 475 AALo[offset] = (MatScalar) 1.0; 476 offset += 1; 477 478 v += nz; 479 vi += nz; 480 } 481 482 /* allocate space for the triangular factor information */ 483 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 484 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 485 /* Create the matrix description */ 486 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 487 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 488 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 489 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 490 #else 491 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 492 #endif 493 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 494 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 495 496 /* set the operation */ 497 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 498 499 /* set the matrix */ 500 loTriFactor->csrMat = new CsrMatrix; 501 loTriFactor->csrMat->num_rows = n; 502 loTriFactor->csrMat->num_cols = n; 503 loTriFactor->csrMat->num_entries = nzLower; 504 505 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 506 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 507 508 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 509 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 510 511 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 512 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 513 514 /* Create the solve analysis information */ 515 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 516 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 517 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 518 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 519 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 520 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 521 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 522 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 523 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 524 #endif 525 526 /* perform the solve analysis */ 527 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 528 loTriFactor->csrMat->num_rows, 
loTriFactor->csrMat->num_entries, loTriFactor->descr, 529 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 530 loTriFactor->csrMat->column_indices->data().get(), 531 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 532 loTriFactor->solveInfo, 533 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 534 #else 535 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 536 #endif 537 cerr = WaitForCUDA();CHKERRCUDA(cerr); 538 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 539 540 /* assign the pointer */ 541 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 542 loTriFactor->AA_h = AALo; 543 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 544 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 545 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 546 } else { /* update values only */ 547 if (!loTriFactor->AA_h) { 548 cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 549 } 550 /* Fill the lower triangular matrix */ 551 loTriFactor->AA_h[0] = 1.0; 552 v = aa; 553 vi = aj; 554 offset = 1; 555 for (i=1; i<n; i++) { 556 nz = ai[i+1] - ai[i]; 557 ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 558 offset += nz; 559 loTriFactor->AA_h[offset] = 1.0; 560 offset += 1; 561 v += nz; 562 } 563 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 564 ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 565 } 566 } catch(char *ex) { 567 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 568 } 569 } 570 PetscFunctionReturn(0); 571 } 572 573 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 574 { 575 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 576 PetscInt n = A->rmap->n; 577 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 578 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 579 cusparseStatus_t stat; 580 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 581 const MatScalar *aa = a->a,*v; 582 PetscInt *AiUp, *AjUp; 583 PetscInt i,nz, nzUpper, offset; 584 PetscErrorCode ierr; 585 cudaError_t cerr; 586 587 PetscFunctionBegin; 588 if (!n) PetscFunctionReturn(0); 589 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 590 try { 591 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 592 nzUpper = adiag[0]-adiag[n]; 593 if (!upTriFactor) { 594 PetscScalar *AAUp; 595 596 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 597 598 /* Allocate Space for the upper triangular matrix */ 599 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 600 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 601 602 /* Fill the upper triangular matrix */ 603 AiUp[0]=(PetscInt) 0; 604 AiUp[n]=nzUpper; 605 offset = nzUpper; 606 for (i=n-1; i>=0; i--) { 607 v = aa + adiag[i+1] + 1; 608 vi = aj + adiag[i+1] + 1; 609 610 /* number of elements NOT on the diagonal */ 611 nz = adiag[i] - adiag[i+1]-1; 612 613 /* decrement the offset */ 614 offset -= (nz+1); 615 616 /* first, set the diagonal elements */ 617 AjUp[offset] = (PetscInt) i; 618 AAUp[offset] = (MatScalar)1./v[nz]; 619 AiUp[i] = AiUp[i+1] - (nz+1); 620 621 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 622 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 623 } 624 625 /* allocate space for the triangular factor information */ 626 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 627 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 628 629 /* Create the matrix description */ 630 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 631 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 632 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 633 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 634 #else 635 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 636 #endif 637 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 638 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 639 640 /* set the operation */ 641 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 642 643 /* set the matrix */ 644 upTriFactor->csrMat = new CsrMatrix; 645 upTriFactor->csrMat->num_rows = n; 646 upTriFactor->csrMat->num_cols = n; 647 upTriFactor->csrMat->num_entries = nzUpper; 648 649 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 650 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 651 652 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 653 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 654 655 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 656 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 657 658 /* Create the solve analysis information */ 659 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 660 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 661 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 662 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 663 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 664 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 665 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 666 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 667 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 668 #endif 669 670 /* perform the solve analysis */ 671 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 672 upTriFactor->csrMat->num_rows, 
upTriFactor->csrMat->num_entries, upTriFactor->descr, 673 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 674 upTriFactor->csrMat->column_indices->data().get(), 675 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 676 upTriFactor->solveInfo, 677 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 678 #else 679 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 680 #endif 681 cerr = WaitForCUDA();CHKERRCUDA(cerr); 682 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 683 684 /* assign the pointer */ 685 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 686 upTriFactor->AA_h = AAUp; 687 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 688 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 689 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 690 } else { 691 if (!upTriFactor->AA_h) { 692 cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 693 } 694 /* Fill the upper triangular matrix */ 695 offset = nzUpper; 696 for (i=n-1; i>=0; i--) { 697 v = aa + adiag[i+1] + 1; 698 699 /* number of elements NOT on the diagonal */ 700 nz = adiag[i] - adiag[i+1]-1; 701 702 /* decrement the offset */ 703 offset -= (nz+1); 704 705 /* first, set the diagonal elements */ 706 upTriFactor->AA_h[offset] = 1./v[nz]; 707 ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 708 } 709 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 710 ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 711 } 712 } catch(char *ex) { 713 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 714 } 715 } 716 PetscFunctionReturn(0); 717 } 718 719 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 720 { 721 PetscErrorCode ierr; 722 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 723 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 724 IS isrow = a->row,iscol = a->icol; 725 PetscBool row_identity,col_identity; 726 PetscInt n = A->rmap->n; 727 728 PetscFunctionBegin; 729 PetscCheckFalse(!cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 730 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 731 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 732 733 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 734 cusparseTriFactors->nnz=a->nz; 735 736 A->offloadmask = PETSC_OFFLOAD_BOTH; 737 /* lower triangular indices */ 738 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 739 if (!row_identity && !cusparseTriFactors->rpermIndices) { 740 const PetscInt *r; 741 742 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 743 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 744 cusparseTriFactors->rpermIndices->assign(r, r+n); 745 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 746 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 747 } 748 749 /* upper triangular indices */ 750 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 751 if (!col_identity && !cusparseTriFactors->cpermIndices) { 752 const PetscInt *c; 753 754 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 755 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 756 cusparseTriFactors->cpermIndices->assign(c, c+n); 757 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 758 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 759 } 760 PetscFunctionReturn(0); 761 } 
762 763 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 764 { 765 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 766 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 767 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 768 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 769 cusparseStatus_t stat; 770 PetscErrorCode ierr; 771 cudaError_t cerr; 772 PetscInt *AiUp, *AjUp; 773 PetscScalar *AAUp; 774 PetscScalar *AALo; 775 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 776 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 777 const PetscInt *ai = b->i,*aj = b->j,*vj; 778 const MatScalar *aa = b->a,*v; 779 780 PetscFunctionBegin; 781 if (!n) PetscFunctionReturn(0); 782 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 783 try { 784 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 785 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 786 if (!upTriFactor && !loTriFactor) { 787 /* Allocate Space for the upper triangular matrix */ 788 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 789 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 790 791 /* Fill the upper triangular matrix */ 792 AiUp[0]=(PetscInt) 0; 793 AiUp[n]=nzUpper; 794 offset = 0; 795 for (i=0; i<n; i++) { 796 /* set the pointers */ 797 v = aa + ai[i]; 798 vj = aj + ai[i]; 799 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 800 801 /* first, set the diagonal elements */ 802 AjUp[offset] = (PetscInt) i; 803 AAUp[offset] = (MatScalar)1.0/v[nz]; 804 AiUp[i] = offset; 805 AALo[offset] = (MatScalar)1.0/v[nz]; 806 807 offset+=1; 808 if (nz>0) { 809 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 810 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 811 for (j=offset; j<offset+nz; j++) { 812 AAUp[j] = -AAUp[j]; 813 AALo[j] = AAUp[j]/v[nz]; 814 } 815 offset+=nz; 816 } 817 } 818 819 /* allocate space for the triangular factor information */ 820 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 821 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 822 823 /* Create the matrix description */ 824 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 825 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 826 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 827 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 828 #else 829 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 830 #endif 831 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 832 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 833 834 /* set the matrix */ 835 upTriFactor->csrMat = new CsrMatrix; 836 upTriFactor->csrMat->num_rows = A->rmap->n; 837 upTriFactor->csrMat->num_cols = A->cmap->n; 838 upTriFactor->csrMat->num_entries = a->nz; 839 840 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 841 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 842 843 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 844 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 845 846 
upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 847 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 848 849 /* set the operation */ 850 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 851 852 /* Create the solve analysis information */ 853 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 854 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 855 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 856 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 857 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 858 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 859 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 860 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 861 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 862 #endif 863 864 /* perform the solve analysis */ 865 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 866 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 867 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 868 upTriFactor->csrMat->column_indices->data().get(), 869 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 870 upTriFactor->solveInfo, 871 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 872 #else 873 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 874 #endif 875 cerr = WaitForCUDA();CHKERRCUDA(cerr); 876 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 877 878 /* assign the pointer */ 879 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 880 881 /* allocate space for the triangular factor information */ 882 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 883 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 884 885 /* Create the matrix description */ 886 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 887 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 888 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 889 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 890 #else 891 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 892 #endif 893 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 894 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 895 896 /* set the operation */ 897 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 898 899 /* set the matrix */ 900 loTriFactor->csrMat = new CsrMatrix; 901 loTriFactor->csrMat->num_rows = A->rmap->n; 902 loTriFactor->csrMat->num_cols = A->cmap->n; 903 loTriFactor->csrMat->num_entries = a->nz; 904 905 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 906 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 907 908 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 909 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 910 911 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 912 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 913 914 /* Create the solve analysis information */ 915 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 916 stat = 
cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 917 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 918 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 919 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 920 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 921 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 922 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 923 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 924 #endif 925 926 /* perform the solve analysis */ 927 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 928 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 929 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 930 loTriFactor->csrMat->column_indices->data().get(), 931 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 932 loTriFactor->solveInfo, 933 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 934 #else 935 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 936 #endif 937 cerr = WaitForCUDA();CHKERRCUDA(cerr); 938 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 939 940 /* assign the pointer */ 941 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 942 943 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 944 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 945 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 946 } else { 947 /* Fill the upper triangular matrix */ 948 offset = 0; 949 for (i=0; i<n; i++) { 950 /* set the pointers */ 951 v = aa + ai[i]; 952 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 953 954 /* first, set the diagonal elements */ 955 AAUp[offset] = 1.0/v[nz]; 956 AALo[offset] = 1.0/v[nz]; 957 958 offset+=1; 959 if (nz>0) { 960 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 961 for (j=offset; j<offset+nz; j++) { 962 AAUp[j] = -AAUp[j]; 963 AALo[j] = AAUp[j]/v[nz]; 964 } 965 offset+=nz; 966 } 967 } 968 PetscCheckFalse(!upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 969 PetscCheckFalse(!loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 970 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 971 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 972 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 973 } 974 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 975 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 976 } catch(char *ex) { 977 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 978 } 979 } 980 PetscFunctionReturn(0); 981 } 982 983 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 984 { 985 PetscErrorCode ierr; 986 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 987 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 988 IS ip = a->row; 989 PetscBool perm_identity; 990 PetscInt n = A->rmap->n; 991 992 PetscFunctionBegin; 993 PetscCheckFalse(!cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 994 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 995 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 996 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 997 998 A->offloadmask = PETSC_OFFLOAD_BOTH; 999 1000 /* lower triangular indices */ 1001 
ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1002 if (!perm_identity) { 1003 IS iip; 1004 const PetscInt *irip,*rip; 1005 1006 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 1007 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 1008 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 1009 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1010 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 1011 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 1012 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 1013 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 1014 ierr = ISDestroy(&iip);CHKERRQ(ierr); 1015 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 1016 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 1017 } 1018 PetscFunctionReturn(0); 1019 } 1020 1021 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 1022 { 1023 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 1024 IS ip = b->row; 1025 PetscBool perm_identity; 1026 PetscErrorCode ierr; 1027 1028 PetscFunctionBegin; 1029 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1030 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 1031 B->offloadmask = PETSC_OFFLOAD_CPU; 1032 /* determine which version of MatSolve needs to be used. */ 1033 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 1034 if (perm_identity) { 1035 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1036 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1037 B->ops->matsolve = NULL; 1038 B->ops->matsolvetranspose = NULL; 1039 } else { 1040 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1041 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1042 B->ops->matsolve = NULL; 1043 B->ops->matsolvetranspose = NULL; 1044 } 1045 1046 /* get the triangular factors */ 1047 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 1048 PetscFunctionReturn(0); 1049 } 1050 1051 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1052 { 1053 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1054 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1055 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1056 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1057 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1058 cusparseStatus_t stat; 1059 cusparseIndexBase_t indexBase; 1060 cusparseMatrixType_t matrixType; 1061 cusparseFillMode_t fillMode; 1062 cusparseDiagType_t diagType; 1063 cudaError_t cerr; 1064 PetscErrorCode ierr; 1065 1066 PetscFunctionBegin; 1067 /* allocate space for the transpose of the lower triangular factor */ 1068 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1069 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1070 1071 /* set the matrix descriptors of the lower triangular factor */ 1072 matrixType = cusparseGetMatType(loTriFactor->descr); 1073 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1074 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1075 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1076 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1077 1078 /* Create the matrix description */ 1079 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1080 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1081 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1082 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1083 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1084 1085 /* set the operation */ 1086 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1087 1088 /* allocate GPU space for the CSC of the lower triangular factor*/ 1089 loTriFactorT->csrMat = new CsrMatrix; 1090 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1091 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1092 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1093 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1094 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1095 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1096 1097 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1098 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1099 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1100 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1101 loTriFactor->csrMat->values->data().get(), 1102 loTriFactor->csrMat->row_offsets->data().get(), 1103 loTriFactor->csrMat->column_indices->data().get(), 1104 loTriFactorT->csrMat->values->data().get(), 1105 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1106 CUSPARSE_ACTION_NUMERIC,indexBase, 1107 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1108 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1109 #endif 1110 1111 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1112 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1113 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1114 loTriFactor->csrMat->values->data().get(), 1115 loTriFactor->csrMat->row_offsets->data().get(), 1116 loTriFactor->csrMat->column_indices->data().get(), 1117 loTriFactorT->csrMat->values->data().get(), 1118 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1119 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1120 CUSPARSE_ACTION_NUMERIC, indexBase, 1121 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1122 #else 1123 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1124 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1125 #endif 1126 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1127 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1128 1129 /* Create the solve analysis information */ 1130 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1131 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1132 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1133 stat = 
cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1134 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1135 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1136 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1137 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1138 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1139 #endif 1140 1141 /* perform the solve analysis */ 1142 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1143 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1144 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1145 loTriFactorT->csrMat->column_indices->data().get(), 1146 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1147 loTriFactorT->solveInfo, 1148 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1149 #else 1150 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1151 #endif 1152 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1153 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1154 1155 /* assign the pointer */ 1156 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1157 1158 /*********************************************/ 1159 /* Now the Transpose of the Upper Tri Factor */ 1160 /*********************************************/ 1161 1162 /* allocate space for the transpose of the upper triangular factor */ 1163 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1164 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1165 1166 /* set the matrix descriptors of the upper triangular factor */ 1167 matrixType = cusparseGetMatType(upTriFactor->descr); 1168 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1169 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1170 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1171 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1172 1173 /* Create the matrix description */ 1174 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1175 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1176 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1177 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1178 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1179 1180 /* set the operation */ 1181 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1182 1183 /* allocate GPU space for the CSC of the upper triangular factor*/ 1184 upTriFactorT->csrMat = new CsrMatrix; 1185 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1186 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1187 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1188 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1189 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1190 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1191 1192 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1193 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1194 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1195 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1196 upTriFactor->csrMat->values->data().get(), 1197 upTriFactor->csrMat->row_offsets->data().get(), 1198 upTriFactor->csrMat->column_indices->data().get(), 1199 upTriFactorT->csrMat->values->data().get(), 1200 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1201 CUSPARSE_ACTION_NUMERIC,indexBase, 1202 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1203 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1204 #endif 1205 1206 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1207 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1208 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1209 upTriFactor->csrMat->values->data().get(), 1210 upTriFactor->csrMat->row_offsets->data().get(), 1211 upTriFactor->csrMat->column_indices->data().get(), 1212 upTriFactorT->csrMat->values->data().get(), 1213 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1214 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1215 CUSPARSE_ACTION_NUMERIC, indexBase, 1216 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1217 #else 1218 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1219 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1220 #endif 1221 1222 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1223 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1224 1225 /* Create the solve analysis information */ 1226 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1227 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1228 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1229 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1230 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1231 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1232 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1233 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1234 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1235 #endif 1236 1237 /* perform the solve analysis */ 1238 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1239 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1240 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1241 upTriFactorT->csrMat->column_indices->data().get(), 1242 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1243 upTriFactorT->solveInfo, 1244 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1245 #else 1246 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1247 #endif 1248 1249 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1250 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1251 1252 /* assign the pointer */ 1253 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1254 PetscFunctionReturn(0); 1255 } 1256 
1257 struct PetscScalarToPetscInt 1258 { 1259 __host__ __device__ 1260 PetscInt operator()(PetscScalar s) 1261 { 1262 return (PetscInt)PetscRealPart(s); 1263 } 1264 }; 1265 1266 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1267 { 1268 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1269 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1270 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1271 cusparseStatus_t stat; 1272 cusparseIndexBase_t indexBase; 1273 cudaError_t err; 1274 PetscErrorCode ierr; 1275 1276 PetscFunctionBegin; 1277 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1278 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1279 PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1280 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1281 PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1282 if (A->transupdated) PetscFunctionReturn(0); 1283 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1284 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1285 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1286 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1287 } 1288 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1289 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1290 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1291 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1292 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1293 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1294 1295 /* set alpha and beta */ 1296 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1297 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1298 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1299 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1300 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1301 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1302 1303 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1304 CsrMatrix *matrixT = new CsrMatrix; 1305 matstructT->mat = matrixT; 1306 matrixT->num_rows = A->cmap->n; 1307 matrixT->num_cols = A->rmap->n; 1308 matrixT->num_entries = a->nz; 1309 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1310 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1311 matrixT->values = new THRUSTARRAY(a->nz); 1312 1313 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1314 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1315 1316 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1317 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1318 stat = cusparseCreateCsr(&matstructT->matDescr, 1319 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1320 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1321 matrixT->values->data().get(), 1322 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1323 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1324 #else 1325 /* 
cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1326 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1327 1328 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1329 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1330 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1331 */ 1332 if (matrixT->num_entries) { 1333 stat = cusparseCreateCsr(&matstructT->matDescr, 1334 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1335 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1336 matrixT->values->data().get(), 1337 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1338 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1339 1340 } else { 1341 matstructT->matDescr = NULL; 1342 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1343 } 1344 #endif 1345 #endif 1346 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1347 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1348 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1349 #else 1350 CsrMatrix *temp = new CsrMatrix; 1351 CsrMatrix *tempT = new CsrMatrix; 1352 /* First convert HYB to CSR */ 1353 temp->num_rows = A->rmap->n; 1354 temp->num_cols = A->cmap->n; 1355 temp->num_entries = a->nz; 1356 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1357 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1358 temp->values = new THRUSTARRAY(a->nz); 1359 1360 stat = cusparse_hyb2csr(cusparsestruct->handle, 1361 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1362 temp->values->data().get(), 1363 temp->row_offsets->data().get(), 1364 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1365 1366 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1367 tempT->num_rows = A->rmap->n; 1368 tempT->num_cols = A->cmap->n; 1369 tempT->num_entries = a->nz; 1370 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1371 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1372 tempT->values = new THRUSTARRAY(a->nz); 1373 1374 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1375 temp->num_cols, temp->num_entries, 1376 temp->values->data().get(), 1377 temp->row_offsets->data().get(), 1378 temp->column_indices->data().get(), 1379 tempT->values->data().get(), 1380 tempT->column_indices->data().get(), 1381 tempT->row_offsets->data().get(), 1382 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1383 1384 /* Last, convert CSC to HYB */ 1385 cusparseHybMat_t hybMat; 1386 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1387 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1388 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1389 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1390 matstructT->descr, tempT->values->data().get(), 1391 tempT->row_offsets->data().get(), 1392 tempT->column_indices->data().get(), 1393 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1394 1395 /* assign the pointer */ 1396 matstructT->mat = hybMat; 1397 A->transupdated = PETSC_TRUE; 1398 /* delete temporaries */ 1399 if (tempT) { 1400 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1401 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1402 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1403 delete (CsrMatrix*) tempT; 1404 } 1405 if (temp) { 1406 if (temp->values) delete (THRUSTARRAY*) temp->values; 1407 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1408 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1409 delete (CsrMatrix*) temp; 1410 } 1411 #endif 1412 } 1413 } 1414 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1415 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1416 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1417 PetscCheckFalse(!matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1418 PetscCheckFalse(!matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1419 PetscCheckFalse(!matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1420 PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1421 PetscCheckFalse(!matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1422 PetscCheckFalse(!matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1423 PetscCheckFalse(!matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1424 PetscCheckFalse(!matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1425 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1426 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1427 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1428 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1429 } 1430 if (!cusparsestruct->csr2csc_i) { 1431 THRUSTARRAY csr2csc_a(matrix->num_entries); 1432 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1433 1434 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1435 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1436 void *csr2cscBuffer; 1437 size_t csr2cscBufferSize; 1438 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1439 A->cmap->n, matrix->num_entries, 1440 matrix->values->data().get(), 1441 cusparsestruct->rowoffsets_gpu->data().get(), 1442 matrix->column_indices->data().get(), 1443 matrixT->values->data().get(), 1444 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1445 CUSPARSE_ACTION_NUMERIC,indexBase, 1446 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1447 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1448 #endif 1449 1450 if (matrix->num_entries) { 1451 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1452 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 
1453 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1454 1455 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1456 should be filled with indexBase. So I just take a shortcut here. 1457 */ 1458 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1459 A->cmap->n,matrix->num_entries, 1460 csr2csc_a.data().get(), 1461 cusparsestruct->rowoffsets_gpu->data().get(), 1462 matrix->column_indices->data().get(), 1463 matrixT->values->data().get(), 1464 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1465 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1466 CUSPARSE_ACTION_NUMERIC,indexBase, 1467 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1468 #else 1469 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1470 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1471 #endif 1472 } else { 1473 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1474 } 1475 1476 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1477 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1478 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1479 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1480 #endif 1481 } 1482 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1483 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1484 matrixT->values->begin())); 1485 } 1486 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1487 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1488 /* the compressed row indices is not used for matTranspose */ 1489 matstructT->cprowIndices = NULL; 1490 /* assign the pointer */ 1491 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1492 A->transupdated = PETSC_TRUE; 1493 PetscFunctionReturn(0); 1494 } 1495 1496 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1497 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1498 { 1499 PetscInt n = xx->map->n; 1500 const PetscScalar *barray; 1501 PetscScalar *xarray; 1502 thrust::device_ptr<const PetscScalar> bGPU; 1503 thrust::device_ptr<PetscScalar> xGPU; 1504 cusparseStatus_t stat; 1505 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1506 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1507 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1508 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1509 PetscErrorCode ierr; 1510 1511 PetscFunctionBegin; 1512 /* Analyze the matrix and create the transpose ... 
on the fly */ 1513 if (!loTriFactorT && !upTriFactorT) { 1514 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1515 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1516 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1517 } 1518 1519 /* Get the GPU pointers */ 1520 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1521 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1522 xGPU = thrust::device_pointer_cast(xarray); 1523 bGPU = thrust::device_pointer_cast(barray); 1524 1525 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1526 /* First, reorder with the row permutation */ 1527 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1528 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1529 xGPU); 1530 1531 /* First, solve U */ 1532 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1533 upTriFactorT->csrMat->num_rows, 1534 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1535 upTriFactorT->csrMat->num_entries, 1536 #endif 1537 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1538 upTriFactorT->csrMat->values->data().get(), 1539 upTriFactorT->csrMat->row_offsets->data().get(), 1540 upTriFactorT->csrMat->column_indices->data().get(), 1541 upTriFactorT->solveInfo, 1542 xarray, 1543 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1544 tempGPU->data().get(), 1545 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1546 #else 1547 tempGPU->data().get());CHKERRCUSPARSE(stat); 1548 #endif 1549 1550 /* Then, solve L */ 1551 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1552 loTriFactorT->csrMat->num_rows, 1553 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1554 loTriFactorT->csrMat->num_entries, 1555 #endif 1556 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1557 loTriFactorT->csrMat->values->data().get(), 1558 loTriFactorT->csrMat->row_offsets->data().get(), 1559 loTriFactorT->csrMat->column_indices->data().get(), 1560 loTriFactorT->solveInfo, 1561 tempGPU->data().get(), 1562 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1563 xarray, 1564 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1565 #else 1566 xarray);CHKERRCUSPARSE(stat); 1567 #endif 1568 1569 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1570 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1571 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1572 tempGPU->begin()); 1573 1574 /* Copy the temporary to the full solution. 
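tempGPU now holds the column-permuted solution, so one last device-to-device copy writes it back into
       xGPU; the permutation gather above reads from xGPU, which is why the reordering cannot be done in place.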
*/ 1575 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1576 1577 /* restore */ 1578 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1579 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1580 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1581 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1582 PetscFunctionReturn(0); 1583 } 1584 1585 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1586 { 1587 const PetscScalar *barray; 1588 PetscScalar *xarray; 1589 cusparseStatus_t stat; 1590 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1591 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1592 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1593 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1594 PetscErrorCode ierr; 1595 1596 PetscFunctionBegin; 1597 /* Analyze the matrix and create the transpose ... on the fly */ 1598 if (!loTriFactorT && !upTriFactorT) { 1599 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1600 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1601 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1602 } 1603 1604 /* Get the GPU pointers */ 1605 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1606 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1607 1608 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1609 /* First, solve U */ 1610 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1611 upTriFactorT->csrMat->num_rows, 1612 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1613 upTriFactorT->csrMat->num_entries, 1614 #endif 1615 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1616 upTriFactorT->csrMat->values->data().get(), 1617 upTriFactorT->csrMat->row_offsets->data().get(), 1618 upTriFactorT->csrMat->column_indices->data().get(), 1619 upTriFactorT->solveInfo, 1620 barray, 1621 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1622 tempGPU->data().get(), 1623 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1624 #else 1625 tempGPU->data().get());CHKERRCUSPARSE(stat); 1626 #endif 1627 1628 /* Then, solve L */ 1629 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1630 loTriFactorT->csrMat->num_rows, 1631 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1632 loTriFactorT->csrMat->num_entries, 1633 #endif 1634 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1635 loTriFactorT->csrMat->values->data().get(), 1636 loTriFactorT->csrMat->row_offsets->data().get(), 1637 loTriFactorT->csrMat->column_indices->data().get(), 1638 loTriFactorT->solveInfo, 1639 tempGPU->data().get(), 1640 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1641 xarray, 1642 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1643 #else 1644 xarray);CHKERRCUSPARSE(stat); 1645 #endif 1646 1647 /* restore */ 1648 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1649 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1650 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1651 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1652 PetscFunctionReturn(0); 1653 } 1654 1655 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1656 { 1657 const 
PetscScalar *barray; 1658 PetscScalar *xarray; 1659 thrust::device_ptr<const PetscScalar> bGPU; 1660 thrust::device_ptr<PetscScalar> xGPU; 1661 cusparseStatus_t stat; 1662 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1663 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1664 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1665 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1666 PetscErrorCode ierr; 1667 1668 PetscFunctionBegin; 1669 1670 /* Get the GPU pointers */ 1671 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1672 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1673 xGPU = thrust::device_pointer_cast(xarray); 1674 bGPU = thrust::device_pointer_cast(barray); 1675 1676 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1677 /* First, reorder with the row permutation */ 1678 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1679 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1680 tempGPU->begin()); 1681 1682 /* Next, solve L */ 1683 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1684 loTriFactor->csrMat->num_rows, 1685 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1686 loTriFactor->csrMat->num_entries, 1687 #endif 1688 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1689 loTriFactor->csrMat->values->data().get(), 1690 loTriFactor->csrMat->row_offsets->data().get(), 1691 loTriFactor->csrMat->column_indices->data().get(), 1692 loTriFactor->solveInfo, 1693 tempGPU->data().get(), 1694 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1695 xarray, 1696 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1697 #else 1698 xarray);CHKERRCUSPARSE(stat); 1699 #endif 1700 1701 /* Then, solve U */ 1702 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1703 upTriFactor->csrMat->num_rows, 1704 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1705 upTriFactor->csrMat->num_entries, 1706 #endif 1707 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1708 upTriFactor->csrMat->values->data().get(), 1709 upTriFactor->csrMat->row_offsets->data().get(), 1710 upTriFactor->csrMat->column_indices->data().get(), 1711 upTriFactor->solveInfo,xarray, 1712 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1713 tempGPU->data().get(), 1714 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1715 #else 1716 tempGPU->data().get());CHKERRCUSPARSE(stat); 1717 #endif 1718 1719 /* Last, reorder with the column permutation */ 1720 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1721 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1722 xGPU); 1723 1724 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1725 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1726 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1727 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1728 PetscFunctionReturn(0); 1729 } 1730 1731 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1732 { 1733 const PetscScalar *barray; 1734 PetscScalar *xarray; 1735 cusparseStatus_t stat; 1736 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1737 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1738 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1739 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1740 PetscErrorCode ierr; 1741 1742 PetscFunctionBegin; 1743 /* Get the GPU pointers */ 1744 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1745 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1746 1747 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1748 /* First, solve L */ 1749 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1750 loTriFactor->csrMat->num_rows, 1751 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1752 loTriFactor->csrMat->num_entries, 1753 #endif 1754 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1755 loTriFactor->csrMat->values->data().get(), 1756 loTriFactor->csrMat->row_offsets->data().get(), 1757 loTriFactor->csrMat->column_indices->data().get(), 1758 loTriFactor->solveInfo, 1759 barray, 1760 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1761 tempGPU->data().get(), 1762 loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1763 #else 1764 tempGPU->data().get());CHKERRCUSPARSE(stat); 1765 #endif 1766 1767 /* Next, solve U */ 1768 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1769 upTriFactor->csrMat->num_rows, 1770 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1771 upTriFactor->csrMat->num_entries, 1772 #endif 1773 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1774 upTriFactor->csrMat->values->data().get(), 1775 upTriFactor->csrMat->row_offsets->data().get(), 1776 upTriFactor->csrMat->column_indices->data().get(), 1777 upTriFactor->solveInfo, 1778 tempGPU->data().get(), 1779 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1780 xarray, 1781 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 1782 #else 1783 xarray);CHKERRCUSPARSE(stat); 1784 #endif 1785 1786 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1787 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1788 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1789 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1790 PetscFunctionReturn(0); 1791 } 1792 1793 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1794 { 1795 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1796 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1797 cudaError_t cerr; 1798 PetscErrorCode ierr; 1799 1800 PetscFunctionBegin; 1801 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1802 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1803 1804 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1805 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1806 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1807 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1808 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1809 A->offloadmask = PETSC_OFFLOAD_BOTH; 1810 } 1811 PetscFunctionReturn(0); 1812 } 1813 1814 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1815 { 1816 PetscErrorCode ierr; 1817 1818 PetscFunctionBegin; 1819 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1820 *array = ((Mat_SeqAIJ*)A->data)->a; 1821 PetscFunctionReturn(0); 1822 } 1823 1824 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar 
*array[]) 1825 { 1826 PetscFunctionBegin; 1827 A->offloadmask = PETSC_OFFLOAD_CPU; 1828 *array = NULL; 1829 PetscFunctionReturn(0); 1830 } 1831 1832 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1833 { 1834 PetscErrorCode ierr; 1835 1836 PetscFunctionBegin; 1837 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1838 *array = ((Mat_SeqAIJ*)A->data)->a; 1839 PetscFunctionReturn(0); 1840 } 1841 1842 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1843 { 1844 PetscFunctionBegin; 1845 *array = NULL; 1846 PetscFunctionReturn(0); 1847 } 1848 1849 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1850 { 1851 PetscFunctionBegin; 1852 *array = ((Mat_SeqAIJ*)A->data)->a; 1853 PetscFunctionReturn(0); 1854 } 1855 1856 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1857 { 1858 PetscFunctionBegin; 1859 A->offloadmask = PETSC_OFFLOAD_CPU; 1860 *array = NULL; 1861 PetscFunctionReturn(0); 1862 } 1863 1864 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1865 { 1866 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1867 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1868 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1869 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1870 PetscErrorCode ierr; 1871 cusparseStatus_t stat; 1872 PetscBool both = PETSC_TRUE; 1873 cudaError_t err; 1874 1875 PetscFunctionBegin; 1876 PetscCheckFalse(A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1877 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1878 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1879 CsrMatrix *matrix; 1880 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1881 1882 PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1883 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1884 matrix->values->assign(a->a, a->a+a->nz); 1885 err = WaitForCUDA();CHKERRCUDA(err); 1886 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1887 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1888 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1889 } else { 1890 PetscInt nnz; 1891 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1892 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1893 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1894 delete cusparsestruct->workVector; 1895 delete cusparsestruct->rowoffsets_gpu; 1896 cusparsestruct->workVector = NULL; 1897 cusparsestruct->rowoffsets_gpu = NULL; 1898 try { 1899 if (a->compressedrow.use) { 1900 m = a->compressedrow.nrows; 1901 ii = a->compressedrow.i; 1902 ridx = a->compressedrow.rindex; 1903 } else { 1904 m = A->rmap->n; 1905 ii = a->i; 1906 ridx = NULL; 1907 } 1908 PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1909 PetscCheckFalse(m && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1910 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1911 else nnz = a->nz; 1912 1913 /* create cusparse matrix */ 1914 cusparsestruct->nrows = m; 1915 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1916 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1917 stat = 
cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1918 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1919 1920 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1921 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1922 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1923 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1924 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1925 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1926 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1927 1928 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1929 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1930 /* set the matrix */ 1931 CsrMatrix *mat= new CsrMatrix; 1932 mat->num_rows = m; 1933 mat->num_cols = A->cmap->n; 1934 mat->num_entries = nnz; 1935 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1936 mat->row_offsets->assign(ii, ii + m+1); 1937 1938 mat->column_indices = new THRUSTINTARRAY32(nnz); 1939 mat->column_indices->assign(a->j, a->j+nnz); 1940 1941 mat->values = new THRUSTARRAY(nnz); 1942 if (a->a) mat->values->assign(a->a, a->a+nnz); 1943 1944 /* assign the pointer */ 1945 matstruct->mat = mat; 1946 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1947 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1948 stat = cusparseCreateCsr(&matstruct->matDescr, 1949 mat->num_rows, mat->num_cols, mat->num_entries, 1950 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1951 mat->values->data().get(), 1952 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1953 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1954 } 1955 #endif 1956 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1957 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1958 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1959 #else 1960 CsrMatrix *mat= new CsrMatrix; 1961 mat->num_rows = m; 1962 mat->num_cols = A->cmap->n; 1963 mat->num_entries = nnz; 1964 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1965 mat->row_offsets->assign(ii, ii + m+1); 1966 1967 mat->column_indices = new THRUSTINTARRAY32(nnz); 1968 mat->column_indices->assign(a->j, a->j+nnz); 1969 1970 mat->values = new THRUSTARRAY(nnz); 1971 if (a->a) mat->values->assign(a->a, a->a+nnz); 1972 1973 cusparseHybMat_t hybMat; 1974 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1975 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1976 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1977 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1978 matstruct->descr, mat->values->data().get(), 1979 mat->row_offsets->data().get(), 1980 mat->column_indices->data().get(), 1981 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1982 /* assign the pointer */ 1983 matstruct->mat = hybMat; 1984 1985 if (mat) { 1986 if (mat->values) delete (THRUSTARRAY*)mat->values; 1987 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1988 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1989 delete (CsrMatrix*)mat; 1990 } 1991 #endif 1992 } 1993 1994 /* assign the compressed row indices */ 1995 if (a->compressedrow.use) { 1996 cusparsestruct->workVector = new THRUSTARRAY(m); 1997 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1998 matstruct->cprowIndices->assign(ridx,ridx+m); 1999 tmp = m; 2000 } else { 2001 cusparsestruct->workVector = NULL; 2002 matstruct->cprowIndices = NULL; 2003 tmp = 0; 2004 } 2005 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 2006 2007 /* assign the pointer */ 2008 cusparsestruct->mat = matstruct; 2009 } catch(char *ex) { 2010 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2011 } 2012 err = WaitForCUDA();CHKERRCUDA(err); 2013 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 2014 cusparsestruct->nonzerostate = A->nonzerostate; 2015 } 2016 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2017 } 2018 PetscFunctionReturn(0); 2019 } 2020 2021 struct VecCUDAPlusEquals 2022 { 2023 template <typename Tuple> 2024 __host__ __device__ 2025 void operator()(Tuple t) 2026 { 2027 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2028 } 2029 }; 2030 2031 struct VecCUDAEquals 2032 { 2033 template <typename Tuple> 2034 __host__ __device__ 2035 void operator()(Tuple t) 2036 { 2037 thrust::get<1>(t) = thrust::get<0>(t); 2038 } 2039 }; 2040 2041 struct VecCUDAEqualsReverse 2042 { 2043 template <typename Tuple> 2044 __host__ __device__ 2045 void operator()(Tuple t) 2046 { 2047 thrust::get<0>(t) = thrust::get<1>(t); 2048 } 2049 }; 2050 2051 struct MatMatCusparse { 2052 PetscBool cisdense; 2053 PetscScalar *Bt; 2054 Mat X; 2055 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2056 PetscLogDouble flops; 2057 CsrMatrix *Bcsr; 2058 2059 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2060 cusparseSpMatDescr_t matSpBDescr; 2061 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2062 cusparseDnMatDescr_t matBDescr; 2063 cusparseDnMatDescr_t matCDescr; 2064 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2065 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2066 void *dBuffer4; 2067 void *dBuffer5; 2068 #endif 2069 size_t mmBufferSize; 2070 void *mmBuffer; 2071 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2072 cusparseSpGEMMDescr_t spgemmDesc; 2073 #endif 2074 }; 2075 2076 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2077 { 2078 PetscErrorCode ierr; 2079 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2080 cudaError_t cerr; 2081 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2082 cusparseStatus_t stat; 2083 #endif 2084 2085 PetscFunctionBegin; 2086 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 2087 delete mmdata->Bcsr; 2088 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2089 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); 
} 2090 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 2091 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 2092 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 2093 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2094 if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); } 2095 if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); } 2096 #endif 2097 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 2098 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 2099 #endif 2100 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 2101 ierr = PetscFree(data);CHKERRQ(ierr); 2102 PetscFunctionReturn(0); 2103 } 2104 2105 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2106 2107 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2108 { 2109 Mat_Product *product = C->product; 2110 Mat A,B; 2111 PetscInt m,n,blda,clda; 2112 PetscBool flg,biscuda; 2113 Mat_SeqAIJCUSPARSE *cusp; 2114 cusparseStatus_t stat; 2115 cusparseOperation_t opA; 2116 const PetscScalar *barray; 2117 PetscScalar *carray; 2118 PetscErrorCode ierr; 2119 MatMatCusparse *mmdata; 2120 Mat_SeqAIJCUSPARSEMultStruct *mat; 2121 CsrMatrix *csrmat; 2122 2123 PetscFunctionBegin; 2124 MatCheckProduct(C,1); 2125 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2126 mmdata = (MatMatCusparse*)product->data; 2127 A = product->A; 2128 B = product->B; 2129 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2130 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2131 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2132 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2133 PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2134 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2135 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2136 switch (product->type) { 2137 case MATPRODUCT_AB: 2138 case MATPRODUCT_PtAP: 2139 mat = cusp->mat; 2140 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2141 m = A->rmap->n; 2142 n = B->cmap->n; 2143 break; 2144 case MATPRODUCT_AtB: 2145 if (!A->form_explicit_transpose) { 2146 mat = cusp->mat; 2147 opA = CUSPARSE_OPERATION_TRANSPOSE; 2148 } else { 2149 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2150 mat = cusp->matTranspose; 2151 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2152 } 2153 m = A->cmap->n; 2154 n = B->cmap->n; 2155 break; 2156 case MATPRODUCT_ABt: 2157 case MATPRODUCT_RARt: 2158 mat = cusp->mat; 2159 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2160 m = A->rmap->n; 2161 n = B->rmap->n; 2162 break; 2163 default: 2164 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2165 } 2166 PetscCheckFalse(!mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2167 csrmat = (CsrMatrix*)mat->mat; 2168 /* if the user passed a CPU matrix, copy the data to the GPU */ 2169 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2170 if (!biscuda) {ierr = 
MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2171 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2172 2173 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2174 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2175 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2176 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2177 } else { 2178 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2179 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2180 } 2181 2182 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2183 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2184 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2185 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2186 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2187 size_t mmBufferSize; 2188 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2189 if (!mmdata->matBDescr) { 2190 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2191 mmdata->Blda = blda; 2192 } 2193 2194 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2195 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2196 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2197 mmdata->Clda = clda; 2198 } 2199 2200 if (!mat->matDescr) { 2201 stat = cusparseCreateCsr(&mat->matDescr, 2202 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2203 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2204 csrmat->values->data().get(), 2205 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2206 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2207 } 2208 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2209 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2210 mmdata->matCDescr,cusparse_scalartype, 2211 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2212 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2213 cudaError_t cerr; 2214 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2215 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2216 mmdata->mmBufferSize = mmBufferSize; 2217 } 2218 mmdata->initialized = PETSC_TRUE; 2219 } else { 2220 /* to be safe, always update pointers of the mats */ 2221 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2222 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2223 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2224 } 2225 2226 /* do cusparseSpMM, which supports transpose on B */ 2227 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2228 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2229 mmdata->matCDescr,cusparse_scalartype, 2230 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2231 #else 2232 PetscInt k; 2233 /* cusparseXcsrmm does not support transpose on B */ 2234 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2235 
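/* Form Bt = B^T explicitly with a cuBLAS geam (C = alpha*op(A) + beta*op(B), alpha = 1, beta = 0, both
       operands transposed views of the same dense B), since the legacy csrmm interface used below cannot
       transpose the dense operand; mmdata->Bt was allocated in the symbolic phase for exactly this case. */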
cublasHandle_t cublasv2handle; 2236 cublasStatus_t cerr; 2237 2238 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2239 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2240 B->cmap->n,B->rmap->n, 2241 &PETSC_CUSPARSE_ONE ,barray,blda, 2242 &PETSC_CUSPARSE_ZERO,barray,blda, 2243 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2244 blda = B->cmap->n; 2245 k = B->cmap->n; 2246 } else { 2247 k = B->rmap->n; 2248 } 2249 2250 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2251 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2252 csrmat->num_entries,mat->alpha_one,mat->descr, 2253 csrmat->values->data().get(), 2254 csrmat->row_offsets->data().get(), 2255 csrmat->column_indices->data().get(), 2256 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2257 carray,clda);CHKERRCUSPARSE(stat); 2258 #endif 2259 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2260 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2261 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2262 if (product->type == MATPRODUCT_RARt) { 2263 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2264 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2265 } else if (product->type == MATPRODUCT_PtAP) { 2266 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2267 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2268 } else { 2269 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2270 } 2271 if (mmdata->cisdense) { 2272 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2273 } 2274 if (!biscuda) { 2275 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2276 } 2277 PetscFunctionReturn(0); 2278 } 2279 2280 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2281 { 2282 Mat_Product *product = C->product; 2283 Mat A,B; 2284 PetscInt m,n; 2285 PetscBool cisdense,flg; 2286 PetscErrorCode ierr; 2287 MatMatCusparse *mmdata; 2288 Mat_SeqAIJCUSPARSE *cusp; 2289 2290 PetscFunctionBegin; 2291 MatCheckProduct(C,1); 2292 PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2293 A = product->A; 2294 B = product->B; 2295 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2296 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2297 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2298 PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2299 switch (product->type) { 2300 case MATPRODUCT_AB: 2301 m = A->rmap->n; 2302 n = B->cmap->n; 2303 break; 2304 case MATPRODUCT_AtB: 2305 m = A->cmap->n; 2306 n = B->cmap->n; 2307 break; 2308 case MATPRODUCT_ABt: 2309 m = A->rmap->n; 2310 n = B->rmap->n; 2311 break; 2312 case MATPRODUCT_PtAP: 2313 m = B->cmap->n; 2314 n = B->cmap->n; 2315 break; 2316 case MATPRODUCT_RARt: 2317 m = B->rmap->n; 2318 n = B->rmap->n; 2319 break; 2320 default: 2321 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2322 } 2323 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2324 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2325 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2326 
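/* the cisdense flag is stashed in the product data below so that the numeric phase
     (MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA) knows to convert C back to MATSEQDENSE when the
     caller provided a host dense matrix */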
ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2327 2328 /* product data */ 2329 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2330 mmdata->cisdense = cisdense; 2331 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2332 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2333 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2334 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2335 } 2336 #endif 2337 /* for these products we need intermediate storage */ 2338 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2339 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2340 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2341 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2342 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2343 } else { 2344 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2345 } 2346 } 2347 C->product->data = mmdata; 2348 C->product->destroy = MatDestroy_MatMatCusparse; 2349 2350 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2351 PetscFunctionReturn(0); 2352 } 2353 2354 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2355 { 2356 Mat_Product *product = C->product; 2357 Mat A,B; 2358 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2359 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2360 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2361 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2362 PetscBool flg; 2363 PetscErrorCode ierr; 2364 cusparseStatus_t stat; 2365 cudaError_t cerr; 2366 MatProductType ptype; 2367 MatMatCusparse *mmdata; 2368 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2369 cusparseSpMatDescr_t BmatSpDescr; 2370 #endif 2371 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2372 2373 PetscFunctionBegin; 2374 MatCheckProduct(C,1); 2375 PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2376 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2377 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2378 mmdata = (MatMatCusparse*)C->product->data; 2379 A = product->A; 2380 B = product->B; 2381 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2382 mmdata->reusesym = PETSC_FALSE; 2383 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2384 PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2385 Cmat = Ccusp->mat; 2386 PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2387 Ccsr = (CsrMatrix*)Cmat->mat; 2388 PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2389 goto finalize; 2390 } 2391 if (!c->nz) goto finalize; 2392 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2393 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2394 ierr = 
PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2395 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2396 PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2397 PetscCheckFalse(B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2398 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2399 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2400 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2401 PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2402 PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2403 PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2404 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2405 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2406 2407 ptype = product->type; 2408 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2409 ptype = MATPRODUCT_AB; 2410 PetscCheckFalse(!product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2411 } 2412 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2413 ptype = MATPRODUCT_AB; 2414 PetscCheckFalse(!product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2415 } 2416 switch (ptype) { 2417 case MATPRODUCT_AB: 2418 Amat = Acusp->mat; 2419 Bmat = Bcusp->mat; 2420 break; 2421 case MATPRODUCT_AtB: 2422 Amat = Acusp->matTranspose; 2423 Bmat = Bcusp->mat; 2424 break; 2425 case MATPRODUCT_ABt: 2426 Amat = Acusp->mat; 2427 Bmat = Bcusp->matTranspose; 2428 break; 2429 default: 2430 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2431 } 2432 Cmat = Ccusp->mat; 2433 PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2434 PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2435 PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2436 Acsr = (CsrMatrix*)Amat->mat; 2437 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2438 Ccsr = (CsrMatrix*)Cmat->mat; 2439 PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2440 PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2441 PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2442 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2443 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2444 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2445 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2446 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2447 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2448 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2449 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2450 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2451 #else 2452 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2453 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2454 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2455 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2456 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2457 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2458 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2459 #endif 2460 #else 2461 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2462 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2463 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2464 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2465 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2466 #endif 2467 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2468 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2469 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2470 C->offloadmask = PETSC_OFFLOAD_GPU; 2471 finalize: 2472 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2473 ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2474 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2475 ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 2476 c->reallocs = 0; 2477 C->info.mallocs += 0; 2478 C->info.nz_unneeded = 0; 2479 C->assembled = C->was_assembled = PETSC_TRUE; 2480 C->num_ass++; 2481 PetscFunctionReturn(0); 2482 } 2483 2484 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2485 { 2486 Mat_Product *product = C->product; 2487 Mat A,B; 2488 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2489 Mat_SeqAIJ *a,*b,*c; 2490 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2491 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2492 PetscInt i,j,m,n,k; 2493 PetscBool flg; 2494 PetscErrorCode ierr; 2495 cusparseStatus_t stat; 2496 cudaError_t cerr; 2497 MatProductType ptype; 2498 MatMatCusparse *mmdata; 2499 PetscLogDouble flops; 2500 PetscBool biscompressed,ciscompressed; 2501 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2502 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2503 cusparseSpMatDescr_t BmatSpDescr; 2504 #else 2505 int cnz; 2506 #endif 2507 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2508 2509 PetscFunctionBegin; 2510 MatCheckProduct(C,1); 2511 PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2512 A = product->A; 2513 B = product->B; 2514 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2515 
PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2516 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2517 PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2518 a = (Mat_SeqAIJ*)A->data; 2519 b = (Mat_SeqAIJ*)B->data; 2520 /* product data */ 2521 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2522 C->product->data = mmdata; 2523 C->product->destroy = MatDestroy_MatMatCusparse; 2524 2525 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2526 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2527 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2528 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2529 PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2530 PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2531 2532 ptype = product->type; 2533 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2534 ptype = MATPRODUCT_AB; 2535 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2536 } 2537 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2538 ptype = MATPRODUCT_AB; 2539 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2540 } 2541 biscompressed = PETSC_FALSE; 2542 ciscompressed = PETSC_FALSE; 2543 switch (ptype) { 2544 case MATPRODUCT_AB: 2545 m = A->rmap->n; 2546 n = B->cmap->n; 2547 k = A->cmap->n; 2548 Amat = Acusp->mat; 2549 Bmat = Bcusp->mat; 2550 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2551 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2552 break; 2553 case MATPRODUCT_AtB: 2554 m = A->cmap->n; 2555 n = B->cmap->n; 2556 k = A->rmap->n; 2557 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2558 Amat = Acusp->matTranspose; 2559 Bmat = Bcusp->mat; 2560 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2561 break; 2562 case MATPRODUCT_ABt: 2563 m = A->rmap->n; 2564 n = B->rmap->n; 2565 k = A->cmap->n; 2566 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2567 Amat = Acusp->mat; 2568 Bmat = Bcusp->matTranspose; 2569 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2570 break; 2571 default: 2572 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2573 } 2574 2575 /* create cusparse matrix */ 2576 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2577 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2578 c = (Mat_SeqAIJ*)C->data; 2579 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2580 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2581 Ccsr = new CsrMatrix; 2582 2583 c->compressedrow.use = ciscompressed; 2584 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2585 c->compressedrow.nrows = a->compressedrow.nrows; 2586 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2587 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2588 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2589 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2590 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2591 } else { 2592 c->compressedrow.nrows = 0; 2593 
c->compressedrow.i = NULL; 2594 c->compressedrow.rindex = NULL; 2595 Ccusp->workVector = NULL; 2596 Cmat->cprowIndices = NULL; 2597 } 2598 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2599 Ccusp->mat = Cmat; 2600 Ccusp->mat->mat = Ccsr; 2601 Ccsr->num_rows = Ccusp->nrows; 2602 Ccsr->num_cols = n; 2603 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2604 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2605 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2606 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2607 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2608 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2609 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2610 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2611 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2612 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2613 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2614 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2615 c->nz = 0; 2616 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2617 Ccsr->values = new THRUSTARRAY(c->nz); 2618 goto finalizesym; 2619 } 2620 2621 PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2622 PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2623 Acsr = (CsrMatrix*)Amat->mat; 2624 if (!biscompressed) { 2625 Bcsr = (CsrMatrix*)Bmat->mat; 2626 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2627 BmatSpDescr = Bmat->matDescr; 2628 #endif 2629 } else { /* we need to use row offsets for the full matrix */ 2630 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2631 Bcsr = new CsrMatrix; 2632 Bcsr->num_rows = B->rmap->n; 2633 Bcsr->num_cols = cBcsr->num_cols; 2634 Bcsr->num_entries = cBcsr->num_entries; 2635 Bcsr->column_indices = cBcsr->column_indices; 2636 Bcsr->values = cBcsr->values; 2637 if (!Bcusp->rowoffsets_gpu) { 2638 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2639 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2640 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2641 } 2642 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2643 mmdata->Bcsr = Bcsr; 2644 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2645 if (Bcsr->num_rows && Bcsr->num_cols) { 2646 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2647 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2648 Bcsr->values->data().get(), 2649 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2650 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2651 } 2652 BmatSpDescr = mmdata->matSpBDescr; 2653 #endif 2654 } 2655 PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2656 PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2657 /* precompute flops count */ 2658 if (ptype == 
MATPRODUCT_AB) { 2659 for (i=0, flops = 0; i<A->rmap->n; i++) { 2660 const PetscInt st = a->i[i]; 2661 const PetscInt en = a->i[i+1]; 2662 for (j=st; j<en; j++) { 2663 const PetscInt brow = a->j[j]; 2664 flops += 2.*(b->i[brow+1] - b->i[brow]); 2665 } 2666 } 2667 } else if (ptype == MATPRODUCT_AtB) { 2668 for (i=0, flops = 0; i<A->rmap->n; i++) { 2669 const PetscInt anzi = a->i[i+1] - a->i[i]; 2670 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2671 flops += (2.*anzi)*bnzi; 2672 } 2673 } else { /* TODO */ 2674 flops = 0.; 2675 } 2676 2677 mmdata->flops = flops; 2678 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2679 2680 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2681 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2682 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2683 NULL, NULL, NULL, 2684 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2685 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2686 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2687 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2688 { 2689 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2690 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2691 */ 2692 void* dBuffer1 = NULL; 2693 void* dBuffer2 = NULL; 2694 void* dBuffer3 = NULL; 2695 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2696 size_t bufferSize1 = 0; 2697 size_t bufferSize2 = 0; 2698 size_t bufferSize3 = 0; 2699 size_t bufferSize4 = 0; 2700 size_t bufferSize5 = 0; 2701 2702 /*----------------------------------------------------------------------*/ 2703 /* ask bufferSize1 bytes for external memory */ 2704 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2705 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2706 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2707 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2708 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2709 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2710 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2711 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2712 2713 /*----------------------------------------------------------------------*/ 2714 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2715 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2716 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2717 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2718 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2719 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2720 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2721 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2722 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2723 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2724 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2725 2726 /*----------------------------------------------------------------------*/ 2727 /* get matrix C non-zero entries C_nnz1 */ 2728 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, 
&C_nnz1);CHKERRCUSPARSE(stat); 2729 c->nz = (PetscInt) C_nnz1; 2730 /* allocate matrix C */ 2731 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2732 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2733 /* update matC with the new pointers */ 2734 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2735 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2736 2737 /*----------------------------------------------------------------------*/ 2738 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2739 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2740 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2741 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2742 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2743 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2744 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2745 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2746 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2747 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2748 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2749 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2750 ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2751 } 2752 #else 2753 size_t bufSize2; 2754 /* ask bufferSize bytes for external memory */ 2755 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2756 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2757 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2758 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2759 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2760 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2761 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2762 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2763 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2764 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2765 /* ask bufferSize again bytes for external memory */ 2766 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2767 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2768 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2769 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2770 /* The CUSPARSE documentation is not clear, nor the API 2771 We need both buffers to perform the operations properly! 2772 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2773 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2774 is stored in the descriptor! What a messy API... 
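     For reference, the sequence this code relies on (our reading of the cuSPARSE generic SpGEMM API, not something the
     documentation spells out) is roughly:
       1) cusparseSpGEMM_workEstimation with a NULL buffer to query bufSize2, allocate mmBuffer2, then call it again;
       2) cusparseSpGEMM_compute with a NULL buffer to query mmBufferSize, allocate mmBuffer, then call it again to do the actual work;
       3) cusparseSpMatGetSize to get nnz(C), allocate the C arrays and register them with cusparseCsrSetPointers;
       4) cusparseSpGEMM_copy to fill the C values.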
*/ 2775 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2776 /* compute the intermediate product of A * B */ 2777 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2778 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2779 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2780 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2781 /* get matrix C non-zero entries C_nnz1 */ 2782 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2783 c->nz = (PetscInt) C_nnz1; 2784 ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2785 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2786 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2787 Ccsr->values = new THRUSTARRAY(c->nz); 2788 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2789 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2790 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2791 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2792 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2793 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2794 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2795 #else 2796 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2797 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2798 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2799 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2800 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2801 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2802 c->nz = cnz; 2803 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2804 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2805 Ccsr->values = new THRUSTARRAY(c->nz); 2806 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2807 2808 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2809 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2810 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2811 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
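     As a consequence (to the best of our understanding), this "symbolic" phase already performs one full numeric csrgemm
     below; the values it produces are simply recomputed by the numeric phase.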
*/ 2812 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2813 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2814 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2815 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2816 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2817 #endif 2818 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2819 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2820 finalizesym: 2821 c->singlemalloc = PETSC_FALSE; 2822 c->free_a = PETSC_TRUE; 2823 c->free_ij = PETSC_TRUE; 2824 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2825 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2826 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2827 PetscInt *d_i = c->i; 2828 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2829 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2830 ii = *Ccsr->row_offsets; 2831 jj = *Ccsr->column_indices; 2832 if (ciscompressed) d_i = c->compressedrow.i; 2833 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2834 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2835 } else { 2836 PetscInt *d_i = c->i; 2837 if (ciscompressed) d_i = c->compressedrow.i; 2838 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2839 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2840 } 2841 if (ciscompressed) { /* need to expand host row offsets */ 2842 PetscInt r = 0; 2843 c->i[0] = 0; 2844 for (k = 0; k < c->compressedrow.nrows; k++) { 2845 const PetscInt next = c->compressedrow.rindex[k]; 2846 const PetscInt old = c->compressedrow.i[k]; 2847 for (; r < next; r++) c->i[r+1] = old; 2848 } 2849 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2850 } 2851 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2852 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2853 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2854 c->maxnz = c->nz; 2855 c->nonzerorowcnt = 0; 2856 c->rmax = 0; 2857 for (k = 0; k < m; k++) { 2858 const PetscInt nn = c->i[k+1] - c->i[k]; 2859 c->ilen[k] = c->imax[k] = nn; 2860 c->nonzerorowcnt += (PetscInt)!!nn; 2861 c->rmax = PetscMax(c->rmax,nn); 2862 } 2863 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2864 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2865 Ccsr->num_entries = c->nz; 2866 2867 C->nonzerostate++; 2868 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2869 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2870 Ccusp->nonzerostate = C->nonzerostate; 2871 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2872 C->preallocated = PETSC_TRUE; 2873 C->assembled = PETSC_FALSE; 2874 C->was_assembled = PETSC_FALSE; 2875 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2876 mmdata->reusesym = PETSC_TRUE; 2877 C->offloadmask = PETSC_OFFLOAD_GPU; 2878 } 2879 C->ops->productnumeric = 
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2880 PetscFunctionReturn(0); 2881 } 2882 2883 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2884 2885 /* handles sparse or dense B */ 2886 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2887 { 2888 Mat_Product *product = mat->product; 2889 PetscErrorCode ierr; 2890 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2891 2892 PetscFunctionBegin; 2893 MatCheckProduct(mat,1); 2894 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2895 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2896 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2897 } 2898 if (product->type == MATPRODUCT_ABC) { 2899 Ciscusp = PETSC_FALSE; 2900 if (!product->C->boundtocpu) { 2901 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2902 } 2903 } 2904 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 2905 PetscBool usecpu = PETSC_FALSE; 2906 switch (product->type) { 2907 case MATPRODUCT_AB: 2908 if (product->api_user) { 2909 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr); 2910 ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2911 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2912 } else { 2913 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr); 2914 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2915 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2916 } 2917 break; 2918 case MATPRODUCT_AtB: 2919 if (product->api_user) { 2920 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr); 2921 ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2922 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2923 } else { 2924 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr); 2925 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2926 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2927 } 2928 break; 2929 case MATPRODUCT_PtAP: 2930 if (product->api_user) { 2931 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr); 2932 ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2933 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2934 } else { 2935 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr); 2936 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2937 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2938 } 2939 break; 2940 case MATPRODUCT_RARt: 2941 if (product->api_user) { 2942 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr); 2943 ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2944 ierr = PetscOptionsEnd();CHKERRQ(ierr); 
2945 } else { 2946 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr); 2947 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2948 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2949 } 2950 break; 2951 case MATPRODUCT_ABC: 2952 if (product->api_user) { 2953 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr); 2954 ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2955 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2956 } else { 2957 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr); 2958 ierr = PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr); 2959 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2960 } 2961 break; 2962 default: 2963 break; 2964 } 2965 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2966 } 2967 /* dispatch */ 2968 if (isdense) { 2969 switch (product->type) { 2970 case MATPRODUCT_AB: 2971 case MATPRODUCT_AtB: 2972 case MATPRODUCT_ABt: 2973 case MATPRODUCT_PtAP: 2974 case MATPRODUCT_RARt: 2975 if (product->A->boundtocpu) { 2976 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2977 } else { 2978 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2979 } 2980 break; 2981 case MATPRODUCT_ABC: 2982 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2983 break; 2984 default: 2985 break; 2986 } 2987 } else if (Biscusp && Ciscusp) { 2988 switch (product->type) { 2989 case MATPRODUCT_AB: 2990 case MATPRODUCT_AtB: 2991 case MATPRODUCT_ABt: 2992 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2993 break; 2994 case MATPRODUCT_PtAP: 2995 case MATPRODUCT_RARt: 2996 case MATPRODUCT_ABC: 2997 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2998 break; 2999 default: 3000 break; 3001 } 3002 } else { /* fallback for AIJ */ 3003 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 3004 } 3005 PetscFunctionReturn(0); 3006 } 3007 3008 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3009 { 3010 PetscErrorCode ierr; 3011 3012 PetscFunctionBegin; 3013 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3014 PetscFunctionReturn(0); 3015 } 3016 3017 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 3018 { 3019 PetscErrorCode ierr; 3020 3021 PetscFunctionBegin; 3022 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 3023 PetscFunctionReturn(0); 3024 } 3025 3026 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3027 { 3028 PetscErrorCode ierr; 3029 3030 PetscFunctionBegin; 3031 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 3032 PetscFunctionReturn(0); 3033 } 3034 3035 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3036 { 3037 PetscErrorCode ierr; 3038 3039 PetscFunctionBegin; 3040 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 3041 PetscFunctionReturn(0); 3042 } 3043 3044 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3045 { 3046 PetscErrorCode ierr; 3047 3048 PetscFunctionBegin; 3049 ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3050 PetscFunctionReturn(0); 3051 } 3052 3053 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 3054 { 3055 int i = blockIdx.x*blockDim.x + threadIdx.x; 3056 if (i < n) y[idx[i]] += x[i]; 3057 } 3058 3059 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3060 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 3061 { 3062 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3063 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 3064 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3065 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3066 PetscErrorCode ierr; 3067 cusparseStatus_t stat; 3068 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3069 PetscBool compressed; 3070 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3071 PetscInt nx,ny; 3072 #endif 3073 3074 PetscFunctionBegin; 3075 PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3076 if (!a->nonzerorowcnt) { 3077 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 3078 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 3079 PetscFunctionReturn(0); 3080 } 3081 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3082 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3083 if (!trans) { 3084 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3085 PetscCheckFalse(!matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3086 } else { 3087 if (herm || !A->form_explicit_transpose) { 3088 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3089 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3090 } else { 3091 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);} 3092 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3093 } 3094 } 3095 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3096 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3097 3098 try { 3099 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3100 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 3101 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 3102 3103 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3104 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3105 /* z = A x + beta y. 3106 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3107 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3108 */ 3109 xptr = xarray; 3110 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3111 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3112 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3113 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3114 allocated to accommodate different uses. So we get the length info directly from mat. 
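     Concretely, for this non-transpose CSR case nx = mat->num_cols and ny = mat->num_rows of the stored (possibly
     compressed-row) matrix; the transpose branch below reads the two fields the other way around.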
3115 */ 3116 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3117 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3118 nx = mat->num_cols; 3119 ny = mat->num_rows; 3120 } 3121 #endif 3122 } else { 3123 /* z = A^T x + beta y 3124 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3125 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3126 */ 3127 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3128 dptr = zarray; 3129 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3130 if (compressed) { /* Scatter x to work vector */ 3131 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3132 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3133 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3134 VecCUDAEqualsReverse()); 3135 } 3136 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3137 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3138 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3139 nx = mat->num_rows; 3140 ny = mat->num_cols; 3141 } 3142 #endif 3143 } 3144 3145 /* csr_spmv does y = alpha op(A) x + beta y */ 3146 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3147 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3148 PetscCheckFalse(opA < 0 || opA > 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3149 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3150 cudaError_t cerr; 3151 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3152 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 3153 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3154 matstruct->matDescr, 3155 matstruct->cuSpMV[opA].vecXDescr, beta, 3156 matstruct->cuSpMV[opA].vecYDescr, 3157 cusparse_scalartype, 3158 cusparsestruct->spmvAlg, 3159 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 3160 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 3161 3162 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3163 } else { 3164 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3165 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 3166 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 3167 } 3168 3169 stat = cusparseSpMV(cusparsestruct->handle, opA, 3170 matstruct->alpha_one, 3171 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3172 matstruct->cuSpMV[opA].vecXDescr, 3173 beta, 3174 matstruct->cuSpMV[opA].vecYDescr, 3175 cusparse_scalartype, 3176 cusparsestruct->spmvAlg, 3177 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 3178 #else 3179 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3180 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 3181 mat->num_rows, mat->num_cols, 3182 mat->num_entries, matstruct->alpha_one, matstruct->descr, 3183 mat->values->data().get(), mat->row_offsets->data().get(), 3184 
mat->column_indices->data().get(), xptr, beta, 3185 dptr);CHKERRCUSPARSE(stat); 3186 #endif 3187 } else { 3188 if (cusparsestruct->nrows) { 3189 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3190 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3191 #else 3192 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3193 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 3194 matstruct->alpha_one, matstruct->descr, hybMat, 3195 xptr, beta, 3196 dptr);CHKERRCUSPARSE(stat); 3197 #endif 3198 } 3199 } 3200 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3201 3202 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3203 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3204 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3205 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 3206 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3207 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3208 } 3209 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3210 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 3211 } 3212 3213 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3214 if (compressed) { 3215 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3216 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3217 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3218 prevent that. So I just add a ScatterAdd kernel. 3219 */ 3220 #if 0 3221 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3222 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3223 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3224 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3225 VecCUDAPlusEquals()); 3226 #else 3227 PetscInt n = matstruct->cprowIndices->size(); 3228 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3229 #endif 3230 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3231 } 3232 } else { 3233 if (yy && yy != zz) { 3234 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 3235 } 3236 } 3237 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 3238 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 3239 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 3240 } catch(char *ex) { 3241 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3242 } 3243 if (yy) { 3244 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 3245 } else { 3246 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 3247 } 3248 PetscFunctionReturn(0); 3249 } 3250 3251 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3252 { 3253 PetscErrorCode ierr; 3254 3255 PetscFunctionBegin; 3256 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 3257 PetscFunctionReturn(0); 3258 } 3259 3260 static 
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 3261 { 3262 PetscErrorCode ierr; 3263 PetscObjectState onnz = A->nonzerostate; 3264 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3265 3266 PetscFunctionBegin; 3267 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 3268 if (onnz != A->nonzerostate && cusp->deviceMat) { 3269 cudaError_t cerr; 3270 3271 ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 3272 cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr); 3273 cusp->deviceMat = NULL; 3274 } 3275 PetscFunctionReturn(0); 3276 } 3277 3278 /* --------------------------------------------------------------------------------*/ 3279 /*@ 3280 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3281 (the default parallel PETSc format). This matrix will ultimately be pushed down 3282 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3283 assembly performance the user should preallocate the matrix storage by setting 3284 the parameter nz (or the array nnz). By setting these parameters accurately, 3285 performance during matrix assembly can be increased by more than a factor of 50. 3286 3287 Collective 3288 3289 Input Parameters: 3290 + comm - MPI communicator, set to PETSC_COMM_SELF 3291 . m - number of rows 3292 . n - number of columns 3293 . nz - number of nonzeros per row (same for all rows) 3294 - nnz - array containing the number of nonzeros in the various rows 3295 (possibly different for each row) or NULL 3296 3297 Output Parameter: 3298 . A - the matrix 3299 3300 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3301 MatXXXXSetPreallocation() paradigm instead of this routine directly. 3302 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3303 3304 Notes: 3305 If nnz is given then nz is ignored 3306 3307 The AIJ format (also called the Yale sparse matrix format or 3308 compressed row storage), is fully compatible with standard Fortran 77 3309 storage. That is, the stored row and column indices can begin at 3310 either one (as in Fortran) or zero. See the users' manual for details. 3311 3312 Specify the preallocated storage with either nz or nnz (not both). 3313 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3314 allocation. For large problems you MUST preallocate memory or you 3315 will get TERRIBLE performance, see the users' manual chapter on matrices. 3316 3317 By default, this format uses inodes (identical nodes) when possible, to 3318 improve numerical efficiency of matrix-vector products and solves. We 3319 search for consecutive rows with the same nonzero structure, thereby 3320 reusing matrix information to achieve increased efficiency.
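   A minimal usage sketch of that recommended MatCreate()/MatSetType()/MatSeqAIJSetPreallocation() paradigm (m, n and the
   nnz array are placeholders supplied by the caller):
.vb
   Mat A;
   ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
   ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
   ierr = MatSeqAIJSetPreallocation(A,0,nnz);CHKERRQ(ierr);  /* nnz[] holds the per-row nonzero counts */
   /* ... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd() ... */
.ve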
3321 3322 Level: intermediate 3323 3324 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3325 @*/ 3326 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3327 { 3328 PetscErrorCode ierr; 3329 3330 PetscFunctionBegin; 3331 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3332 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3333 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3334 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3335 PetscFunctionReturn(0); 3336 } 3337 3338 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3339 { 3340 PetscErrorCode ierr; 3341 3342 PetscFunctionBegin; 3343 if (A->factortype == MAT_FACTOR_NONE) { 3344 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3345 } else { 3346 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3347 } 3348 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3349 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3350 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr); 3351 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3352 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3353 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3354 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3355 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3356 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3357 ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 3358 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3359 PetscFunctionReturn(0); 3360 } 3361 3362 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3363 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3364 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3365 { 3366 PetscErrorCode ierr; 3367 3368 PetscFunctionBegin; 3369 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3370 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3371 PetscFunctionReturn(0); 3372 } 3373 3374 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3375 { 3376 PetscErrorCode ierr; 3377 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3378 Mat_SeqAIJCUSPARSE *cy; 3379 Mat_SeqAIJCUSPARSE *cx; 3380 PetscScalar *ay; 3381 const PetscScalar *ax; 3382 CsrMatrix *csry,*csrx; 3383 3384 PetscFunctionBegin; 3385 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3386 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3387 if (X->ops->axpy != Y->ops->axpy) { 3388 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3389 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3390 PetscFunctionReturn(0); 3391 } 3392 /* if we are here, it means both matrices are bound to GPU */ 3393 ierr = 
MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3394 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3395 PetscCheckFalse(cy->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3396 PetscCheckFalse(cx->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3397 csry = (CsrMatrix*)cy->mat->mat; 3398 csrx = (CsrMatrix*)cx->mat->mat; 3399 /* see if we can turn this into a cublas axpy */ 3400 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3401 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3402 if (eq) { 3403 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3404 } 3405 if (eq) str = SAME_NONZERO_PATTERN; 3406 } 3407 /* spgeam is buggy with one column */ 3408 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3409 3410 if (str == SUBSET_NONZERO_PATTERN) { 3411 cusparseStatus_t stat; 3412 PetscScalar b = 1.0; 3413 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3414 size_t bufferSize; 3415 void *buffer; 3416 cudaError_t cerr; 3417 #endif 3418 3419 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3420 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3421 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3422 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3423 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3424 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3425 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3426 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3427 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3428 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3429 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3430 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3431 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3432 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3433 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3434 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3435 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3436 #else 3437 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3438 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3439 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3440 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3441 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3442 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3443 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3444 #endif 3445 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3446 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3447 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3448 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3449 } else if (str == SAME_NONZERO_PATTERN) { 3450 cublasHandle_t cublasv2handle; 3451 
cublasStatus_t berr; 3452 PetscBLASInt one = 1, bnz = 1; 3453 3454 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3455 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3456 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3457 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3458 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3459 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3460 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3461 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3462 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3463 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3464 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3465 } else { 3466 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3467 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3468 } 3469 PetscFunctionReturn(0); 3470 } 3471 3472 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3473 { 3474 PetscErrorCode ierr; 3475 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3476 PetscScalar *ay; 3477 cublasHandle_t cublasv2handle; 3478 cublasStatus_t berr; 3479 PetscBLASInt one = 1, bnz = 1; 3480 3481 PetscFunctionBegin; 3482 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3483 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3484 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3485 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3486 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3487 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3488 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3489 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3490 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3491 PetscFunctionReturn(0); 3492 } 3493 3494 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3495 { 3496 PetscErrorCode ierr; 3497 PetscBool both = PETSC_FALSE; 3498 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3499 3500 PetscFunctionBegin; 3501 if (A->factortype == MAT_FACTOR_NONE) { 3502 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3503 if (spptr->mat) { 3504 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3505 if (matrix->values) { 3506 both = PETSC_TRUE; 3507 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3508 } 3509 } 3510 if (spptr->matTranspose) { 3511 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3512 if (matrix->values) { 3513 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3514 } 3515 } 3516 } 3517 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3518 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3519 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3520 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3521 else A->offloadmask = PETSC_OFFLOAD_CPU; 3522 PetscFunctionReturn(0); 3523 } 3524 3525 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3526 { 3527 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3528 PetscErrorCode ierr; 3529 3530 PetscFunctionBegin; 3531 if (A->factortype != MAT_FACTOR_NONE) { 3532 A->boundtocpu = flg; 3533 PetscFunctionReturn(0); 3534 } 3535 if (flg) { 3536 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3537 3538 A->ops->scale = MatScale_SeqAIJ; 3539 A->ops->axpy = MatAXPY_SeqAIJ; 3540 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3541 A->ops->mult = MatMult_SeqAIJ; 3542 A->ops->multadd = MatMultAdd_SeqAIJ; 3543 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3544 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 
3545 A->ops->multhermitiantranspose = NULL; 3546 A->ops->multhermitiantransposeadd = NULL; 3547 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3548 ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3549 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3552 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3553 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3554 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3555 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3556 } else { 3557 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3558 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3559 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3560 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3561 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3562 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3563 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3564 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3565 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3566 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3567 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3568 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3569 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3570 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3571 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3572 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3573 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3574 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3575 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3576 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3577 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3578 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3579 } 3580 A->boundtocpu = flg; 3581 if (flg && a->inode.size) { 3582 a->inode.use = PETSC_TRUE; 3583 } else { 3584 a->inode.use = PETSC_FALSE; 3585 } 3586 PetscFunctionReturn(0); 3587 } 3588 3589 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3590 { 3591 PetscErrorCode ierr; 3592 cusparseStatus_t stat; 3593 Mat B; 3594 3595 PetscFunctionBegin; 3596 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3597 if (reuse == MAT_INITIAL_MATRIX) { 3598 ierr = 
MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3599 } else if (reuse == MAT_REUSE_MATRIX) { 3600 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3601 } 3602 B = *newmat; 3603 3604 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3605 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3606 3607 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3608 if (B->factortype == MAT_FACTOR_NONE) { 3609 Mat_SeqAIJCUSPARSE *spptr; 3610 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3611 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3612 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3613 spptr->format = MAT_CUSPARSE_CSR; 3614 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3615 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3616 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3617 #else 3618 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3619 #endif 3620 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3621 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3622 #endif 3623 B->spptr = spptr; 3624 } else { 3625 Mat_SeqAIJCUSPARSETriFactors *spptr; 3626 3627 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3628 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3629 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3630 B->spptr = spptr; 3631 } 3632 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3633 } 3634 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3635 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3636 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3637 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3638 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3639 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3640 3641 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3642 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3643 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3644 #if defined(PETSC_HAVE_HYPRE) 3645 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3646 #endif 3647 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr); 3648 PetscFunctionReturn(0); 3649 } 3650 3651 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3652 { 3653 PetscErrorCode ierr; 3654 3655 PetscFunctionBegin; 3656 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3657 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3658 PetscFunctionReturn(0); 3659 } 3660 3661 /*MC 3662 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3663 3664 A matrix type whose data resides on Nvidia GPUs. These matrices can be in either 3665 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3666 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3667 3668 Options Database Keys: 3669 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3670 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3671 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3672 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3673 3674 Level: beginner 3675 3676 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3677 M*/ 3678 3679 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3680 3681 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3682 { 3683 PetscErrorCode ierr; 3684 3685 PetscFunctionBegin; 3686 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3687 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3688 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3689 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3690 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3691 3692 PetscFunctionReturn(0); 3693 } 3694 3695 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3696 { 3697 PetscErrorCode ierr; 3698 cusparseStatus_t stat; 3699 3700 PetscFunctionBegin; 3701 if (*cusparsestruct) { 3702 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3703 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3704 delete (*cusparsestruct)->workVector; 3705 delete (*cusparsestruct)->rowoffsets_gpu; 3706 delete (*cusparsestruct)->cooPerm; 3707 delete (*cusparsestruct)->cooPerm_a; 3708 delete (*cusparsestruct)->csr2csc_i; 3709 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3710 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3711 } 3712 PetscFunctionReturn(0); 3713 } 3714 3715 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3716 { 3717 PetscFunctionBegin; 3718 if (*mat) { 3719 delete (*mat)->values; 3720 delete (*mat)->column_indices; 3721 delete (*mat)->row_offsets; 3722 delete *mat; 3723 *mat = 0; 3724 } 3725 PetscFunctionReturn(0); 3726 } 3727 3728 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3729 { 3730 cusparseStatus_t stat; 3731 PetscErrorCode ierr; 3732 3733 PetscFunctionBegin; 3734 if (*trifactor) { 3735 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3736 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3737 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3738 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3739 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3740 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3741 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3742 #endif 3743 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3744 } 3745 
PetscFunctionReturn(0); 3746 } 3747 3748 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3749 { 3750 CsrMatrix *mat; 3751 cusparseStatus_t stat; 3752 cudaError_t err; 3753 3754 PetscFunctionBegin; 3755 if (*matstruct) { 3756 if ((*matstruct)->mat) { 3757 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3758 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3759 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3760 #else 3761 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3762 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3763 #endif 3764 } else { 3765 mat = (CsrMatrix*)(*matstruct)->mat; 3766 CsrMatrix_Destroy(&mat); 3767 } 3768 } 3769 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3770 delete (*matstruct)->cprowIndices; 3771 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3772 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3773 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3774 3775 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3776 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3777 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3778 for (int i=0; i<3; i++) { 3779 if (mdata->cuSpMV[i].initialized) { 3780 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3781 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3782 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3783 } 3784 } 3785 #endif 3786 delete *matstruct; 3787 *matstruct = NULL; 3788 } 3789 PetscFunctionReturn(0); 3790 } 3791 3792 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3793 { 3794 PetscErrorCode ierr; 3795 3796 PetscFunctionBegin; 3797 if (*trifactors) { 3798 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3799 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3800 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3801 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3802 delete (*trifactors)->rpermIndices; 3803 delete (*trifactors)->cpermIndices; 3804 delete (*trifactors)->workVector; 3805 (*trifactors)->rpermIndices = NULL; 3806 (*trifactors)->cpermIndices = NULL; 3807 (*trifactors)->workVector = NULL; 3808 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3809 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3810 (*trifactors)->init_dev_prop = PETSC_FALSE; 3811 } 3812 PetscFunctionReturn(0); 3813 } 3814 3815 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3816 { 3817 PetscErrorCode ierr; 3818 cusparseHandle_t handle; 3819 cusparseStatus_t stat; 3820 3821 PetscFunctionBegin; 3822 if (*trifactors) { 3823 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3824 if (handle = (*trifactors)->handle) { 3825 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3826 } 3827 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3828 } 3829 PetscFunctionReturn(0); 3830 } 3831 3832 struct IJCompare 
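/* Descriptive note (added): less-than functor passed to thrust::sort_by_key()/thrust::merge() below; it orders COO entries lexicographically, first by row index, then by column index. */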
3833 { 3834 __host__ __device__ 3835 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3836 { 3837 if (t1.get<0>() < t2.get<0>()) return true; 3838 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3839 return false; 3840 } 3841 }; 3842 3843 struct IJEqual 3844 { 3845 __host__ __device__ 3846 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3847 { 3848 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3849 return true; 3850 } 3851 }; 3852 3853 struct IJDiff 3854 { 3855 __host__ __device__ 3856 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3857 { 3858 return t1 == t2 ? 0 : 1; 3859 } 3860 }; 3861 3862 struct IJSum 3863 { 3864 __host__ __device__ 3865 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3866 { 3867 return t1||t2; 3868 } 3869 }; 3870 3871 #include <thrust/iterator/discard_iterator.h> 3872 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3873 { 3874 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3875 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3876 THRUSTARRAY *cooPerm_v = NULL; 3877 thrust::device_ptr<const PetscScalar> d_v; 3878 CsrMatrix *matrix; 3879 PetscErrorCode ierr; 3880 PetscInt n; 3881 3882 PetscFunctionBegin; 3883 PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3884 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3885 if (!cusp->cooPerm) { 3886 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3887 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3888 PetscFunctionReturn(0); 3889 } 3890 matrix = (CsrMatrix*)cusp->mat->mat; 3891 PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3892 if (!v) { 3893 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3894 goto finalize; 3895 } 3896 n = cusp->cooPerm->size(); 3897 if (isCudaMem(v)) { 3898 d_v = thrust::device_pointer_cast(v); 3899 } else { 3900 cooPerm_v = new THRUSTARRAY(n); 3901 cooPerm_v->assign(v,v+n); 3902 d_v = cooPerm_v->data(); 3903 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3904 } 3905 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3906 if (imode == ADD_VALUES) { /* ADD_VALUES means add to existing ones */ 3907 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them together */ 3908 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3909 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3910 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3911 cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[]. 3912 cooPerm_a is ordered. d_v[cooPerm[i]] contributes to the cooPerm_a[i]-th unique nonzero.
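Illustration (added, using the n = 6 example from MatSetPreallocationCOO_SeqAIJCUSPARSE() below): with cooPerm = [2,4,1,0,3,5] the permuted stream vbit is [d_v[2],d_v[4],d_v[1],d_v[0],d_v[3],d_v[5]], and reduce_by_key with keys cooPerm_a = [0,0,1,2,3,4] writes the five sums [d_v[2]+d_v[4], d_v[1], d_v[0], d_v[3], d_v[5]] into cooPerm_w[], one per unique nonzero.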
3913 */ 3914 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3915 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3916 delete cooPerm_w; 3917 } else { 3918 /* all nonzeros in d_v[] are unique entries */ 3919 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3920 matrix->values->begin())); 3921 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3922 matrix->values->end())); 3923 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3924 } 3925 } else { 3926 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3927 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3928 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3929 } else { 3930 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3931 matrix->values->begin())); 3932 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3933 matrix->values->end())); 3934 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3935 } 3936 } 3937 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3938 finalize: 3939 delete cooPerm_v; 3940 A->offloadmask = PETSC_OFFLOAD_GPU; 3941 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3942 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3943 ierr = PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3944 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3945 ierr = PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr); 3946 a->reallocs = 0; 3947 A->info.mallocs += 0; 3948 A->info.nz_unneeded = 0; 3949 A->assembled = A->was_assembled = PETSC_TRUE; 3950 A->num_ass++; 3951 PetscFunctionReturn(0); 3952 } 3953 3954 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3955 { 3956 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3957 PetscErrorCode ierr; 3958 3959 PetscFunctionBegin; 3960 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3961 if (!cusp) PetscFunctionReturn(0); 3962 if (destroy) { 3963 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3964 delete cusp->csr2csc_i; 3965 cusp->csr2csc_i = NULL; 3966 } 3967 A->transupdated = PETSC_FALSE; 3968 PetscFunctionReturn(0); 3969 } 3970 3971 #include <thrust/binary_search.h> 3972 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 3973 { 3974 PetscErrorCode ierr; 3975 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3976 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3977 PetscInt cooPerm_n, nzr = 0; 3978 cudaError_t cerr; 3979 3980 PetscFunctionBegin; 3981 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3982 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3983 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3984 if (n != cooPerm_n) { 3985 delete cusp->cooPerm; 3986 delete cusp->cooPerm_a; 3987 cusp->cooPerm = NULL; 3988 cusp->cooPerm_a = NULL; 3989 } 3990 if (n) { 3991 THRUSTINTARRAY d_i(n); 3992 THRUSTINTARRAY d_j(n); 3993 THRUSTINTARRAY ii(A->rmap->n); 3994 3995 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3996 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3997 3998 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3999 d_i.assign(coo_i,coo_i+n); 4000 d_j.assign(coo_j,coo_j+n); 4001 4002 /* Ex. 4003 n = 6 4004 coo_i = [3,3,1,4,1,4] 4005 coo_j = [3,2,2,5,2,6] 4006 */ 4007 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 4008 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 4009 4010 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4011 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4012 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4013 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 4014 THRUSTINTARRAY w = d_j; 4015 4016 /* 4017 d_i = [1,1,3,3,4,4] 4018 d_j = [2,2,2,3,5,6] 4019 cooPerm = [2,4,1,0,3,5] 4020 */ 4021 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4022 4023 /* 4024 d_i = [1,3,3,4,4,x] 4025 ^ekey 4026 d_j = [2,2,3,5,6,x] 4027 ^nekye 4028 */ 4029 if (nekey == ekey) { /* all entries are unique */ 4030 delete cusp->cooPerm_a; 4031 cusp->cooPerm_a = NULL; 4032 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4033 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4034 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4035 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4036 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4037 w[0] = 0; 4038 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4039 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4040 } 4041 thrust::counting_iterator<PetscInt> search_begin(0); 4042 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4043 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4044 ii.begin()); /* ii = [0,1,1,3,5,5]. 
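That is, ii[r] counts the (unique) nonzeros in rows 0 through r, and ii[] is copied into a->i[1..n] below.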
A leading 0 will be added later */ 4045 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4046 4047 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 4048 a->singlemalloc = PETSC_FALSE; 4049 a->free_a = PETSC_TRUE; 4050 a->free_ij = PETSC_TRUE; 4051 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 4052 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4053 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4054 a->nz = a->maxnz = a->i[A->rmap->n]; 4055 a->rmax = 0; 4056 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 4057 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 4058 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4059 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 4060 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 4061 for (PetscInt i = 0; i < A->rmap->n; i++) { 4062 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4063 nzr += (PetscInt)!!(nnzr); 4064 a->ilen[i] = a->imax[i] = nnzr; 4065 a->rmax = PetscMax(a->rmax,nnzr); 4066 } 4067 a->nonzerorowcnt = nzr; 4068 A->preallocated = PETSC_TRUE; 4069 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4070 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 4071 } else { 4072 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 4073 } 4074 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 4075 4076 /* We want to allocate the CUSPARSE struct for matvec now. 4077 The code is so convoluted now that I prefer to copy zeros */ 4078 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 4079 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 4080 A->offloadmask = PETSC_OFFLOAD_CPU; 4081 A->nonzerostate++; 4082 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4083 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 4084 4085 A->assembled = PETSC_FALSE; 4086 A->was_assembled = PETSC_FALSE; 4087 PetscFunctionReturn(0); 4088 } 4089 4090 /*@C 4091 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 
4092 4093 Not collective 4094 4095 Input Parameters: 4096 + A - the matrix 4097 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4098 4099 Output Parameters: 4100 + ia - the CSR row pointers 4101 - ja - the CSR column indices 4102 4103 Level: developer 4104 4105 Notes: 4106 When compressed is true, the CSR structure does not contain empty rows 4107 4108 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 4109 @*/ 4110 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4111 { 4112 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4113 CsrMatrix *csr; 4114 PetscErrorCode ierr; 4115 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4116 4117 PetscFunctionBegin; 4118 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4119 if (!i || !j) PetscFunctionReturn(0); 4120 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4121 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4122 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4123 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4124 csr = (CsrMatrix*)cusp->mat->mat; 4125 if (i) { 4126 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4127 if (!cusp->rowoffsets_gpu) { 4128 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4129 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4130 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4131 } 4132 *i = cusp->rowoffsets_gpu->data().get(); 4133 } else *i = csr->row_offsets->data().get(); 4134 } 4135 if (j) *j = csr->column_indices->data().get(); 4136 PetscFunctionReturn(0); 4137 } 4138 4139 /*@C 4140 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4141 4142 Not collective 4143 4144 Input Parameters: 4145 + A - the matrix 4146 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4147 4148 Output Parameters: 4149 + ia - the CSR row pointers 4150 - ja - the CSR column indices 4151 4152 Level: developer 4153 4154 .seealso: MatSeqAIJCUSPARSEGetIJ() 4155 @*/ 4156 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4157 { 4158 PetscFunctionBegin; 4159 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4160 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4161 if (i) *i = NULL; 4162 if (j) *j = NULL; 4163 PetscFunctionReturn(0); 4164 } 4165 4166 /*@C 4167 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4168 4169 Not Collective 4170 4171 Input Parameter: 4172 . A - a MATSEQAIJCUSPARSE matrix 4173 4174 Output Parameter: 4175 . 
a - pointer to the device data 4176 4177 Level: developer 4178 4179 Notes: may trigger host-device copies if up-to-date matrix data is on host 4180 4181 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead() 4182 @*/ 4183 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4184 { 4185 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4186 CsrMatrix *csr; 4187 PetscErrorCode ierr; 4188 4189 PetscFunctionBegin; 4190 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4191 PetscValidPointer(a,2); 4192 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4193 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4194 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4195 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4196 csr = (CsrMatrix*)cusp->mat->mat; 4197 PetscCheckFalse(!csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4198 *a = csr->values->data().get(); 4199 PetscFunctionReturn(0); 4200 } 4201 4202 /*@C 4203 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4204 4205 Not Collective 4206 4207 Input Parameter: 4208 . A - a MATSEQAIJCUSPARSE matrix 4209 4210 Output Parameter: 4211 . a - pointer to the device data 4212 4213 Level: developer 4214 4215 .seealso: MatSeqAIJCUSPARSEGetArrayRead() 4216 @*/ 4217 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4218 { 4219 PetscFunctionBegin; 4220 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4221 PetscValidPointer(a,2); 4222 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4223 *a = NULL; 4224 PetscFunctionReturn(0); 4225 } 4226 4227 /*@C 4228 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4229 4230 Not Collective 4231 4232 Input Parameter: 4233 . A - a MATSEQAIJCUSPARSE matrix 4234 4235 Output Parameter: 4236 . a - pointer to the device data 4237 4238 Level: developer 4239 4240 Notes: may trigger host-device copies if up-to-date matrix data is on host 4241 4242 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray() 4243 @*/ 4244 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4245 { 4246 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4247 CsrMatrix *csr; 4248 PetscErrorCode ierr; 4249 4250 PetscFunctionBegin; 4251 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4252 PetscValidPointer(a,2); 4253 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4254 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4255 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4256 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4257 csr = (CsrMatrix*)cusp->mat->mat; 4258 PetscCheckFalse(!csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4259 *a = csr->values->data().get(); 4260 A->offloadmask = PETSC_OFFLOAD_GPU; 4261 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4262 PetscFunctionReturn(0); 4263 } 4264 /*@C 4265 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4266 4267 Not Collective 4268 4269 Input Parameter: 4270 . A - a MATSEQAIJCUSPARSE matrix 4271 4272 Output Parameter: 4273 . 
a - pointer to the device data 4274 4275 Level: developer 4276 4277 .seealso: MatSeqAIJCUSPARSEGetArray() 4278 @*/ 4279 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4280 { 4281 PetscErrorCode ierr; 4282 4283 PetscFunctionBegin; 4284 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4285 PetscValidPointer(a,2); 4286 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4287 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4288 *a = NULL; 4289 PetscFunctionReturn(0); 4290 } 4291 4292 /*@C 4293 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4294 4295 Not Collective 4296 4297 Input Parameter: 4298 . A - a MATSEQAIJCUSPARSE matrix 4299 4300 Output Parameter: 4301 . a - pointer to the device data 4302 4303 Level: developer 4304 4305 Notes: does not trigger host-device copies and flags data validity on the GPU 4306 4307 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 4308 @*/ 4309 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4310 { 4311 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4312 CsrMatrix *csr; 4313 PetscErrorCode ierr; 4314 4315 PetscFunctionBegin; 4316 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4317 PetscValidPointer(a,2); 4318 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4319 PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4320 PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4321 csr = (CsrMatrix*)cusp->mat->mat; 4322 PetscCheckFalse(!csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4323 *a = csr->values->data().get(); 4324 A->offloadmask = PETSC_OFFLOAD_GPU; 4325 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4326 PetscFunctionReturn(0); 4327 } 4328 4329 /*@C 4330 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 4331 4332 Not Collective 4333 4334 Input Parameter: 4335 . A - a MATSEQAIJCUSPARSE matrix 4336 4337 Output Parameter: 4338 . a - pointer to the device data 4339 4340 Level: developer 4341 4342 .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 4343 @*/ 4344 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4345 { 4346 PetscErrorCode ierr; 4347 4348 PetscFunctionBegin; 4349 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4350 PetscValidPointer(a,2); 4351 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4352 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4353 *a = NULL; 4354 PetscFunctionReturn(0); 4355 } 4356 4357 struct IJCompare4 4358 { 4359 __host__ __device__ 4360 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4361 { 4362 if (t1.get<0>() < t2.get<0>()) return true; 4363 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4364 return false; 4365 } 4366 }; 4367 4368 struct Shift 4369 { 4370 int _shift; 4371 4372 Shift(int shift) : _shift(shift) {} 4373 __host__ __device__ 4374 inline int operator() (const int &c) 4375 { 4376 return c + _shift; 4377 } 4378 }; 4379 4380 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
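Each row of the result C is the corresponding row of A followed by the corresponding row of B (B's column indices are shifted by A->cmap->n), i.e. C = [A B], the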
[A';B']' operation in matlab notation */ 4381 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4382 { 4383 PetscErrorCode ierr; 4384 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4385 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4386 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4387 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4388 PetscInt Annz,Bnnz; 4389 cusparseStatus_t stat; 4390 PetscInt i,m,n,zero = 0; 4391 cudaError_t cerr; 4392 4393 PetscFunctionBegin; 4394 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4395 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4396 PetscValidPointer(C,4); 4397 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4398 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4399 PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 4400 PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4401 PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4402 PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4403 if (reuse == MAT_INITIAL_MATRIX) { 4404 m = A->rmap->n; 4405 n = A->cmap->n + B->cmap->n; 4406 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4407 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4408 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4409 c = (Mat_SeqAIJ*)(*C)->data; 4410 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4411 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4412 Ccsr = new CsrMatrix; 4413 Cmat->cprowIndices = NULL; 4414 c->compressedrow.use = PETSC_FALSE; 4415 c->compressedrow.nrows = 0; 4416 c->compressedrow.i = NULL; 4417 c->compressedrow.rindex = NULL; 4418 Ccusp->workVector = NULL; 4419 Ccusp->nrows = m; 4420 Ccusp->mat = Cmat; 4421 Ccusp->mat->mat = Ccsr; 4422 Ccsr->num_rows = m; 4423 Ccsr->num_cols = n; 4424 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4425 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4426 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4427 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4428 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4429 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4430 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4431 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4432 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4433 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4434 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4435 PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4436 PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4437 4438 Acsr = (CsrMatrix*)Acusp->mat->mat; 4439 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4440 Annz = (PetscInt)Acsr->column_indices->size(); 4441 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4442 c->nz = Annz + Bnnz; 4443 Ccsr->row_offsets = new 
THRUSTINTARRAY32(m+1); 4444 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4445 Ccsr->values = new THRUSTARRAY(c->nz); 4446 Ccsr->num_entries = c->nz; 4447 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4448 if (c->nz) { 4449 auto Acoo = new THRUSTINTARRAY32(Annz); 4450 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4451 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4452 THRUSTINTARRAY32 *Aroff,*Broff; 4453 4454 if (a->compressedrow.use) { /* need full row offset */ 4455 if (!Acusp->rowoffsets_gpu) { 4456 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4457 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4458 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4459 } 4460 Aroff = Acusp->rowoffsets_gpu; 4461 } else Aroff = Acsr->row_offsets; 4462 if (b->compressedrow.use) { /* need full row offset */ 4463 if (!Bcusp->rowoffsets_gpu) { 4464 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4465 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4466 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4467 } 4468 Broff = Bcusp->rowoffsets_gpu; 4469 } else Broff = Bcsr->row_offsets; 4470 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4471 stat = cusparseXcsr2coo(Acusp->handle, 4472 Aroff->data().get(), 4473 Annz, 4474 m, 4475 Acoo->data().get(), 4476 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4477 stat = cusparseXcsr2coo(Bcusp->handle, 4478 Broff->data().get(), 4479 Bnnz, 4480 m, 4481 Bcoo->data().get(), 4482 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4483 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4484 auto Aperm = thrust::make_constant_iterator(1); 4485 auto Bperm = thrust::make_constant_iterator(0); 4486 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4487 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4488 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4489 #else 4490 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4491 auto Bcib = Bcsr->column_indices->begin(); 4492 auto Bcie = Bcsr->column_indices->end(); 4493 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4494 #endif 4495 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4496 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4497 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4498 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4499 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4500 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4501 auto p1 = Ccusp->cooPerm->begin(); 4502 auto p2 = Ccusp->cooPerm->begin(); 4503 thrust::advance(p2,Annz); 4504 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4505 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4506 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4507 #endif 4508 auto cci = thrust::make_counting_iterator(zero); 4509 auto cce = thrust::make_counting_iterator(c->nz); 4510 #if 0 //Errors on SUMMIT cuda 11.1.0 4511 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4512 #else 4513 auto pred 
= thrust::identity<int>(); 4514 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4515 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4516 #endif 4517 stat = cusparseXcoo2csr(Ccusp->handle, 4518 Ccoo->data().get(), 4519 c->nz, 4520 m, 4521 Ccsr->row_offsets->data().get(), 4522 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4523 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4524 delete wPerm; 4525 delete Acoo; 4526 delete Bcoo; 4527 delete Ccoo; 4528 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4529 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4530 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4531 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4532 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4533 #endif 4534 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4535 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 4536 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4537 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4538 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4539 CsrMatrix *CcsrT = new CsrMatrix; 4540 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4541 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4542 4543 (*C)->form_explicit_transpose = PETSC_TRUE; 4544 (*C)->transupdated = PETSC_TRUE; 4545 Ccusp->rowoffsets_gpu = NULL; 4546 CmatT->cprowIndices = NULL; 4547 CmatT->mat = CcsrT; 4548 CcsrT->num_rows = n; 4549 CcsrT->num_cols = m; 4550 CcsrT->num_entries = c->nz; 4551 4552 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4553 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4554 CcsrT->values = new THRUSTARRAY(c->nz); 4555 4556 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4557 auto rT = CcsrT->row_offsets->begin(); 4558 if (AT) { 4559 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4560 thrust::advance(rT,-1); 4561 } 4562 if (BT) { 4563 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4564 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4565 thrust::copy(titb,tite,rT); 4566 } 4567 auto cT = CcsrT->column_indices->begin(); 4568 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4569 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4570 auto vT = CcsrT->values->begin(); 4571 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4572 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4573 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4574 4575 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4576 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4577 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4578 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4579 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4580 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4581 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4582 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4583 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4584 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4585 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4586 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4587 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4588 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4589 #endif 4590 Ccusp->matTranspose = CmatT; 4591 } 4592 } 4593 4594 c->singlemalloc = PETSC_FALSE; 4595 c->free_a = PETSC_TRUE; 4596 c->free_ij = PETSC_TRUE; 4597 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4598 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4599 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4600 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4601 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4602 ii = *Ccsr->row_offsets; 4603 jj = *Ccsr->column_indices; 4604 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4605 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4606 } else { 4607 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4608 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4609 } 4610 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4611 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4612 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4613 c->maxnz = c->nz; 4614 c->nonzerorowcnt = 0; 4615 c->rmax = 0; 4616 for (i = 0; i < m; i++) { 4617 const PetscInt nn = c->i[i+1] - c->i[i]; 4618 c->ilen[i] = c->imax[i] = nn; 4619 c->nonzerorowcnt += (PetscInt)!!nn; 4620 c->rmax = PetscMax(c->rmax,nn); 4621 } 4622 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4623 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4624 (*C)->nonzerostate++; 4625 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4626 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4627 Ccusp->nonzerostate = (*C)->nonzerostate; 4628 (*C)->preallocated = PETSC_TRUE; 4629 } else { 4630 PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4631 c = (Mat_SeqAIJ*)(*C)->data; 4632 if (c->nz) { 4633 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4634 PetscCheckFalse(!Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4635 PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4636 PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4637 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4638 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4639 PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4640 PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing 
Mat_SeqAIJCUSPARSEMultStruct"); 4641 Acsr = (CsrMatrix*)Acusp->mat->mat; 4642 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4643 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4644 PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 4645 PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4646 PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4647 PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4648 PetscCheckFalse(Ccusp->cooPerm->size() != Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4649 auto pmid = Ccusp->cooPerm->begin(); 4650 thrust::advance(pmid,Acsr->num_entries); 4651 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4652 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4653 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4654 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4655 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4656 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4657 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4658 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4659 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4660 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4661 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4662 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4663 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4664 PetscCheckFalse(!Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4665 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4666 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4667 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4668 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4669 auto vT = CcsrT->values->begin(); 4670 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4671 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4672 (*C)->transupdated = PETSC_TRUE; 4673 } 4674 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4675 } 4676 } 4677 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4678 (*C)->assembled = PETSC_TRUE; 4679 (*C)->was_assembled = PETSC_FALSE; 4680 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4681 PetscFunctionReturn(0); 4682 } 4683 4684 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4685 { 4686 PetscErrorCode ierr; 4687 bool dmem; 4688 const PetscScalar *av; 4689 cudaError_t cerr; 4690 4691 PetscFunctionBegin; 4692 dmem = isCudaMem(v); 4693 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4694 if (n && idx) { 4695 THRUSTINTARRAY widx(n); 4696 widx.assign(idx,idx+n); 4697 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4698 4699 THRUSTARRAY *w = NULL; 4700 thrust::device_ptr<PetscScalar> dv; 4701 if (dmem) { 4702 dv = thrust::device_pointer_cast(v); 4703 } else { 4704 w = new THRUSTARRAY(n); 4705 dv = w->data(); 4706 } 4707 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4708 4709 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4710 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4711 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4712 if (w) { 4713 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4714 } 4715 delete w; 4716 } else { 4717 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4718 } 4719 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4720 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4721 PetscFunctionReturn(0); 4722 } 4723
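
/*
   Illustrative usage sketch (added; kept out of compilation) of the COO assembly path and the
   device-array accessors implemented in this file. The helper name, the matrix sizes, and the
   numerical values are made up for the example; the index set reuses the n = 6 example from the
   comments in MatSetPreallocationCOO_SeqAIJCUSPARSE(), where the entry (1,2) appears twice and
   is therefore accumulated.
*/
#if 0
static PetscErrorCode ExampleCOOAssembly_SeqAIJCUSPARSE(void)
{
  Mat                A;
  const PetscScalar *da;
  const PetscInt     coo_i[] = {3,3,1,4,1,4};        /* row indices (unsorted, (1,2) repeated) */
  const PetscInt     coo_j[] = {3,2,2,5,2,6};        /* column indices */
  const PetscScalar  v[]     = {1.,2.,3.,4.,5.,6.};  /* one value per COO entry; duplicates are summed */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
  ierr = MatSetSizes(A,6,7,6,7);CHKERRQ(ierr);                  /* 6 rows, 7 columns, large enough for the indices above */
  ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO(A,6,coo_i,coo_j);CHKERRQ(ierr); /* builds the CSR pattern on the GPU */
  ierr = MatSetValuesCOO(A,v,ADD_VALUES);CHKERRQ(ierr);         /* fills the device values; entry (1,2) receives v[2]+v[4] */
  ierr = MatView(A,PETSC_VIEWER_STDOUT_SELF);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&da);CHKERRQ(ierr);    /* da is a device pointer to the CSR values */
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&da);CHKERRQ(ierr);
  ierr = MatDestroy(&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif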