1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/async/for_each.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const 
MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the factorization, solve, option and multiplication routines implemented in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destruction helpers for the GPU-side data structures (C++ overloads distinguish the struct kinds) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Attach a user-provided CUDA stream to the cuSPARSE handle stored in the matrix GPU data (A->spptr) */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the cuSPARSE handle stored in the matrix GPU data; any previously held handle is destroyed first */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat =
cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  /* scalar arguments (alpha/beta) for cuSPARSE calls live in device memory for this handle */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the cuSPARSE handle reference; a no-op unless A is MATSEQAIJCUSPARSE with GPU data */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports the solver package name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create the (empty) factor matrix B for A; installs the symbolic-factorization ops appropriate
   for the requested factor type, falling back to the CPU SeqAIJ versions when A is bound to the CPU */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr =
PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested GPU storage format */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): records the CPU-solve flag.
   Guard against a missing spptr for consistency with MatCUSPARSESetStream()/MatCUSPARSESetHandle(),
   which perform the same check before dereferencing A->spptr. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Output Parameter:

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   Use this method to specify whether the solve is done on the CPU or GPU (GPU is the default).
   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Option handler: intercepts MAT_FORM_EXPLICIT_TRANSPOSE, delegates every other option to SeqAIJ */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: the factors are computed on the CPU by the SeqAIJ kernel, then the
   solve operations are selected (natural-ordering fast path when both permutations are identities)
   and, unless CPU solve was requested, the triangular factors are analyzed and copied to the GPU */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* runtime options (storage format, CPU solve, SpMV/SpMM/csr2csc algorithms) */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr =
PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any stale GPU factor data, delegate to the CPU SeqAIJ symbolic phase,
   then install the CUSPARSE numeric factorization */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reset stale GPU data, delegate to SeqAIJ, install CUSPARSE Cholesky numeric phase */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as ICC above */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors
*cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or update in place) the unit-diagonal lower triangular factor L on the GPU from the
   CPU-resident combined ILU factor stored in A's SeqAIJ data, then run the cuSPARSE triangular
   solve analysis on it. Host staging buffers are allocated with cudaMallocHost (pinned memory). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr =
WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or update in place) the non-unit-diagonal upper triangular factor U on the GPU from the
   CPU-resident combined ILU factor (rows walked backwards via a->diag), then run the cuSPARSE
   triangular solve analysis. Mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix() above. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy,
upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both triangular factors on the GPU, allocate the solve work vector, and upload the
   row/column permutation indices when the orderings are not identities */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Build both ICC triangular factors on the GPU; note the factor data is read through the
   Mat_SeqSBAIJ layout (the ICC factor is stored in SBAIJ form) */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b =
(Mat_SeqSBAIJ*)A->data; /* NOTE(review): A->data is viewed both as Mat_SeqAIJ ('a', above) and Mat_SeqSBAIJ ('b'); the CSR arrays are read through the SBAIJ view -- confirm the ICC factor is stored as SBAIJ */
  const PetscInt *ai = b->i,*aj = b->j,*vj;
  const MatScalar *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper- and lower-factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers; the diagonal is the LAST entry of row i (v[nz]) */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals are negated for the upper factor; the lower-factor values are additionally scaled by the inverse diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the lower solve reuses the upper factor's CSR pattern: it keeps FILL_MODE_UPPER and is solved transposed (see solveOp below) */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factor structure already on the GPU: only refresh the numerical values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - pushes the ICC triangular factors to the
   GPU (building them and running the solve analysis via MatSeqAIJCUSPARSEBuildICCTriMatrices),
   allocates the device work vector used by MatSolve, and caches the factorization
   permutation and its inverse on the GPU when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS ip = a->row;
  PetscBool perm_identity;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* symmetric storage: each off-diagonal entry contributes to both factors, the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr =
ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
if (!perm_identity) {
  IS iip;
  const PetscInt *irip,*rip;

  /* rperm = the permutation itself, cperm = its inverse (symmetric factorization uses one IS) */
  ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
  ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
  cusparseTriFactors->rpermIndices->assign(rip, rip+n);
  cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
  cusparseTriFactors->cpermIndices->assign(irip, irip+n);
  ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
  ierr = ISDestroy(&iip);CHKERRQ(ierr);
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
}
PetscFunctionReturn(0);
}

/*
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky/ICC factorization for
   SEQAIJCUSPARSE matrices. Copies A back to the host if needed, performs the numeric
   factorization on the CPU (MatCholeskyFactorNumeric_SeqAIJ), selects the natural-ordering
   MatSolve variants when the factorization permutation is the identity, and finally
   pushes the triangular factors to the GPU.

   MatMatSolve is not supported on the GPU, hence matsolve/matsolvetranspose are NULLed.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS ip = b->row;
  PetscBool perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds the transposes (CSC forms) of both
   GPU triangular factors and runs the cusparse solve analysis on them, for use by
   MatSolveTranspose. The transposed factors inherit the original descriptors except
   that the fill mode is flipped (upper <-> lower).
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t stat;
  cusparseIndexBase_t indexBase;
  cusparseMatrixType_t matrixType;
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;
  cudaError_t cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; /* transposing flips the fill mode */
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was PetscLogEventBegin, leaving the GenerateTranspose event opened
     above unbalanced (corrupts PETSc event nesting/timings); it must be ended here */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; /* transposing flips the fill mode */
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was PetscLogEventBegin, leaving the GenerateTranspose event opened
     above unbalanced (corrupts PETSc event nesting/timings); it must be ended here */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* unary functor: truncate the real part of a PetscScalar to a PetscInt (used to
   recover the csr2csc permutation from values seeded with thrust::sequence) */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
cusparseStatus_t stat; 1271 cusparseIndexBase_t indexBase; 1272 cudaError_t err; 1273 PetscErrorCode ierr; 1274 1275 PetscFunctionBegin; 1276 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1277 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1278 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1279 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1280 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1281 if (A->transupdated) PetscFunctionReturn(0); 1282 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1283 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1284 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1285 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1286 } 1287 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1288 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1289 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1290 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1291 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1292 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1293 1294 /* set alpha and beta */ 1295 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1296 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1297 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1298 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1299 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1300 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1301 1302 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1303 CsrMatrix *matrixT = new CsrMatrix; 1304 matstructT->mat = matrixT; 1305 matrixT->num_rows = A->cmap->n; 1306 matrixT->num_cols = A->rmap->n; 1307 matrixT->num_entries = a->nz; 1308 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1309 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1310 matrixT->values = new THRUSTARRAY(a->nz); 1311 1312 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1313 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1314 1315 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1316 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1317 stat = cusparseCreateCsr(&matstructT->matDescr, 1318 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1319 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1320 matrixT->values->data().get(), 1321 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1322 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1323 #else 1324 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1325 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1326 1327 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1328 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1329 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
*/
    if (matrixT->num_entries) {
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

    } else {
      /* no nonzeros: skip cusparseCreateCsr and hand-fill the row offsets with the index base instead */
      matstructT->matDescr = NULL;
      matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
    }
#endif
#endif
  } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    /* For ELL/HYB storage the transpose is built by the round trip HYB -> CSR -> CSC -> HYB,
       using two host-allocated CsrMatrix temporaries that are freed at the end of this branch. */
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    A->transupdated = PETSC_TRUE;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build (once) the csr2csc_i permutation: run csr2csc with the "values" 0,1,2,... so that, after
         conversion, matrixT->values holds for each transposed nonzero the index of the source nonzero in A.
         That permuted index sequence is then cast to integers and cached in csr2csc_i, so later value-only
         transpose updates reduce to a single permuted copy (see the thrust::copy below). */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 csr2cscEx2 needs an explicit, separately-sized work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n,matrix->num_entries,
                                csr2csc_a.data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
#endif
    }
    /* value-only transpose update: gather A's values through the cached csr2csc_i permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(),
cusparsestruct->csr2csc_i->begin()),
                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                      matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b with the (separately analyzed) transposed triangular factors:
   b is row-permuted into x, then the transposed upper factor and the transposed lower factor
   are solved in that order, and finally the column permutation is applied through a temporary. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (only on the first transpose solve) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (the transposed factors reverse the usual L-then-U order) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for natural (identity) ordering:
   no row/column permutation copies are needed, so b feeds the U solve directly and
   the L solve writes straight into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached lower/upper triangular factors:
   row-permute b into the work vector, solve L then U, then column-permute into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: only the index-iterator range determines the span,
     so the end iterator's base pointer (bGPU) is irrelevant here */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached factors under natural ordering:
   no permutation copies; b feeds the L solve directly and the U solve writes into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Mirror the device CSR values back into the host array a->a when the GPU copy is ahead,
   then mark both copies as up to date. Only values are copied; the pattern is assumed unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Read-write host access: sync device -> host first; the matching Restore marks the CPU side dirty. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  /* caller may have modified host values, so the GPU copy is now stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Read-only host access: sync device -> host, but leave the offload mask untouched on Restore. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only host access: no device -> host copy is needed since the old values will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Push the host AIJ data to the GPU. Two paths:
   - same nonzero pattern and CSR format: copy values only and invalidate the cached transpose values;
   - otherwise: destroy and rebuild the whole cuSPARSE mult structure (descriptor, scalars, CSR or HYB/ELL
     storage, compressed-row index list) from the host arrays. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* PETSC_FALSE: pattern unchanged, only the transposed values need regeneration */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr =
PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern changed (or non-CSR format): rebuild everything from scratch */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are kept; ridx maps them back to full rows */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: size from the row offsets and do not claim both copies are valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident 1/0 scalars for the device pointer mode set just below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* stage the data in a temporary device CSR, convert it to HYB, then discard the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Functor for thrust over zipped iterators: second element += first element */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Functor for thrust over zipped iterators: second element = first element */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Functor for thrust over zipped iterators: first element = second element */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data cached between the symbolic and numeric phases of
   sparse-times-(sparse|dense) products; released by MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool             cisdense;
  PetscScalar           *Bt;
  Mat                   X;
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Release every buffer, descriptor and auxiliary matrix held by a MatMatCusparse, then the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
#endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A) * B for A seqaijcusparse and B/C dense CUDA. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr
= MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the mult struct (plain or explicit transpose) and the output sizes per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      /* let cusparse apply the transpose implicitly */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the stored explicit transpose of A instead */
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* PtAP/RARt write the sparse-dense product into the intermediate X, not into C */
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* descriptors embed the leading dimension, so a changed lda forces re-creation */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the SpMM work buffer */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly with a cublas geam into the preallocated mmdata->Bt */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ?
mmdata->Bt : barray,blda,mat->beta_zero, 2256 carray,clda);CHKERRCUSPARSE(stat); 2257 #endif 2258 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2259 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2260 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2261 if (product->type == MATPRODUCT_RARt) { 2262 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2263 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2264 } else if (product->type == MATPRODUCT_PtAP) { 2265 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2266 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2267 } else { 2268 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2269 } 2270 if (mmdata->cisdense) { 2271 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2272 } 2273 if (!biscuda) { 2274 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2275 } 2276 PetscFunctionReturn(0); 2277 } 2278 2279 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2280 { 2281 Mat_Product *product = C->product; 2282 Mat A,B; 2283 PetscInt m,n; 2284 PetscBool cisdense,flg; 2285 PetscErrorCode ierr; 2286 MatMatCusparse *mmdata; 2287 Mat_SeqAIJCUSPARSE *cusp; 2288 2289 PetscFunctionBegin; 2290 MatCheckProduct(C,1); 2291 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2292 A = product->A; 2293 B = product->B; 2294 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2295 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2296 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2297 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2298 switch 
(product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase of the sparse-sparse product C = op(A)*op(B) for seqaijcusparse
   matrices, using cusparseSpGEMMreuse (CUDA >= 11.4), cusparseSpGEMM (CUDA >= 11)
   or csrgemm (older CUDA). The sparsity pattern of C and all descriptors/buffers
   were prepared by the symbolic phase. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  ptype = product->type;
  /* symmetric operands let the transposed products fall back to a plain AB;
     the symbolic phase must have made the same choice (flags checked below) */
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose; /* explicit transposes were formed in the symbolic phase */
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* reuse the structure/buffers set up with SpGEMMreuse in the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of the sparse-sparse product for seqaijcusparse matrices:
   computes the sparsity pattern of C on the GPU, allocates its CSR storage and
   all cusparse descriptors/work buffers so that the numeric phase can be
   repeated cheaply. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  ptype = product->type;
  /* symmetric operands: replace the transposed products with plain AB and
     record the decision so the numeric phase can verify it */
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows  = Ccusp->nrows;
  Ccsr->num_cols  = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants for the DEVICE pointer mode used below */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    /* column indices and values are shared with the compressed CSR; only the
       row offsets differ (full-matrix offsets cached in rowoffsets_gpu) */
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each nonzero a(i,brow) multiplies the whole row brow of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A pairs against row i of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* create C's descriptor with nnz = 0; real pointers are set after the pattern is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void* dBuffer1 = NULL;
    void* dBuffer2 = NULL;
    void* dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    /* query-then-call pattern: first call sizes the three buffers, second computes nnz */
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
    cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
    /* compute the values once here; subsequent numeric calls redo only this step */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
    ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
  }
2751 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2752 size_t bufSize2; 2753 /* ask bufferSize bytes for external memory */ 2754 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2755 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2756 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2757 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2758 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2759 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2760 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2761 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2762 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2763 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2764 /* ask bufferSize again bytes for external memory */ 2765 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2766 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2767 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2768 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2769 /* The CUSPARSE documentation is not clear, nor the API 2770 We need both buffers to perform the operations properly! 2771 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2772 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2773 is stored in the descriptor! What a messy API... 
*/ 2774 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2775 /* compute the intermediate product of A * B */ 2776 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2777 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2778 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2779 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2780 /* get matrix C non-zero entries C_nnz1 */ 2781 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2782 c->nz = (PetscInt) C_nnz1; 2783 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2784 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2785 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2786 Ccsr->values = new THRUSTARRAY(c->nz); 2787 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2788 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2789 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2790 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2791 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2792 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2793 #endif 2794 #else 2795 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2796 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2797 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2798 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2799 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2800 Cmat->descr, Ccsr->row_offsets->data().get(), 
&cnz);CHKERRCUSPARSE(stat); 2801 c->nz = cnz; 2802 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2803 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2804 Ccsr->values = new THRUSTARRAY(c->nz); 2805 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2806 2807 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2808 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2809 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2810 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2811 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2812 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2813 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2814 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2815 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2816 #endif 2817 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2818 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2819 finalizesym: 2820 c->singlemalloc = PETSC_FALSE; 2821 c->free_a = PETSC_TRUE; 2822 c->free_ij = PETSC_TRUE; 2823 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2824 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2825 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2826 PetscInt *d_i = c->i; 2827 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2828 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2829 ii = *Ccsr->row_offsets; 2830 jj = *Ccsr->column_indices; 2831 if (ciscompressed) d_i = c->compressedrow.i; 2832 cerr = 
cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2833 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2834 } else { 2835 PetscInt *d_i = c->i; 2836 if (ciscompressed) d_i = c->compressedrow.i; 2837 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2838 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2839 } 2840 if (ciscompressed) { /* need to expand host row offsets */ 2841 PetscInt r = 0; 2842 c->i[0] = 0; 2843 for (k = 0; k < c->compressedrow.nrows; k++) { 2844 const PetscInt next = c->compressedrow.rindex[k]; 2845 const PetscInt old = c->compressedrow.i[k]; 2846 for (; r < next; r++) c->i[r+1] = old; 2847 } 2848 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2849 } 2850 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2851 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2852 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2853 c->maxnz = c->nz; 2854 c->nonzerorowcnt = 0; 2855 c->rmax = 0; 2856 for (k = 0; k < m; k++) { 2857 const PetscInt nn = c->i[k+1] - c->i[k]; 2858 c->ilen[k] = c->imax[k] = nn; 2859 c->nonzerorowcnt += (PetscInt)!!nn; 2860 c->rmax = PetscMax(c->rmax,nn); 2861 } 2862 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2863 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2864 Ccsr->num_entries = c->nz; 2865 2866 C->nonzerostate++; 2867 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2868 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2869 Ccusp->nonzerostate = C->nonzerostate; 2870 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2871 C->preallocated = PETSC_TRUE; 2872 C->assembled = PETSC_FALSE; 2873 C->was_assembled = PETSC_FALSE; 
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask  = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat);
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
/* Select the symbolic-product implementation for mat (a MatProduct container):
   - dense B dispatches to the CUDA dense kernels (or CPU SeqAIJ_SeqDense when A is bound to CPU),
   - sparse CUSPARSE B/C dispatches to the CUSPARSE sparse-sparse kernels,
   - otherwise falls back to the plain SeqAIJ implementation.
   Per-product-type command line options (e.g. -matmatmult_backend_cpu) let the user force the CPU backend. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only consider the GPU backend when neither operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on whether the user went through the old API (MatMatMult etc.)
       or the MatProduct API; the queried flag is the same in both branches */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* forces the fallback branch below */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx: dispatch to the shared kernel with no addend, no transpose */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H*xx (trans=PETSC_TRUE, herm=PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H*xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx (trans=PETSC_TRUE, herm=PETSC_FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Scatter-add kernel: y[idx[i]] += x[i] for 0 <= i < n.
   Launched with a 1D grid; each thread handles one entry and extra threads exit via the bounds check. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared implementation behind all the MatMult* wrappers above. When A uses compressed
   rows (zero rows dropped on the GPU) a work vector mediates between the short product
   and the full-length x/z, with a thrust gather before (transpose case) or a ScatterAdd
   after (non-transpose case) the cuSPARSE SpMV call. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* logical lengths of x and y in y = op(A) x */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* A has no nonzero rows: op(A) x is zero, so only the addend survives */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* use A's own storage and let cuSPARSE apply the (conjugate) transpose operation */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* build (once) and use an explicitly stored transpose, keeping opA = non-transpose */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache below, so guard against ABI changes in the enum values */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand: dense-vector descriptors and SpMV buffer are cached per operation */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Standard SeqAIJ assembly, plus: if assembly changed the nonzero pattern, the cached
   device-side matrix representation (deviceMat) is stale and is freed here. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   onnz = A->nonzerostate; /* snapshot to detect a pattern change below */
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
   nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast away const: the preallocation routine does not modify nnz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the GPU-side data (plain or factored), remove the composed query functions,
   then fall through to the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* clear every function composed on this object at creation/convert time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate on the host via SeqAIJ, then convert the copy in place back to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y += a*X on the GPU when both matrices are CUSPARSE/CSR; otherwise (mismatched ops,
   i.e. one matrix bound to CPU) invalidate Y's cached transpose and defer to the CPU AXPY. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if
we can turn this into a cublas axpy */ 3399 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3400 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3401 if (eq) { 3402 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3403 } 3404 if (eq) str = SAME_NONZERO_PATTERN; 3405 } 3406 /* spgeam is buggy with one column */ 3407 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3408 3409 if (str == SUBSET_NONZERO_PATTERN) { 3410 cusparseStatus_t stat; 3411 PetscScalar b = 1.0; 3412 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3413 size_t bufferSize; 3414 void *buffer; 3415 cudaError_t cerr; 3416 #endif 3417 3418 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3419 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3420 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3421 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3422 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3423 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3424 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3425 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3426 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3427 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3428 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3429 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3430 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3431 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 
3432 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3433 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3434 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3435 #else 3436 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3437 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3438 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3439 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3440 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3441 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3442 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3443 #endif 3444 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3445 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3446 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3447 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3448 } else if (str == SAME_NONZERO_PATTERN) { 3449 cublasHandle_t cublasv2handle; 3450 cublasStatus_t berr; 3451 PetscBLASInt one = 1, bnz = 1; 3452 3453 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3454 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3455 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3456 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3457 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3458 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3459 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3460 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3461 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3462 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3463 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3464 } else { 3465 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3466 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3467 } 3468 
PetscFunctionReturn(0); 3469 } 3470 3471 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3472 { 3473 PetscErrorCode ierr; 3474 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3475 PetscScalar *ay; 3476 cublasHandle_t cublasv2handle; 3477 cublasStatus_t berr; 3478 PetscBLASInt one = 1, bnz = 1; 3479 3480 PetscFunctionBegin; 3481 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3482 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3483 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3484 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3485 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3486 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3487 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3488 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3489 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3490 PetscFunctionReturn(0); 3491 } 3492 3493 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3494 { 3495 PetscErrorCode ierr; 3496 PetscBool both = PETSC_FALSE; 3497 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3498 3499 PetscFunctionBegin; 3500 if (A->factortype == MAT_FACTOR_NONE) { 3501 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3502 if (spptr->mat) { 3503 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3504 if (matrix->values) { 3505 both = PETSC_TRUE; 3506 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3507 } 3508 } 3509 if (spptr->matTranspose) { 3510 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3511 if (matrix->values) { 3512 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3513 } 3514 } 3515 } 3516 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3517 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3518 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3519 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3520 else A->offloadmask = PETSC_OFFLOAD_CPU; 3521 PetscFunctionReturn(0); 3522 } 3523 3524 static 
PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3525 { 3526 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3527 PetscErrorCode ierr; 3528 3529 PetscFunctionBegin; 3530 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3531 if (flg) { 3532 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3533 3534 A->ops->scale = MatScale_SeqAIJ; 3535 A->ops->axpy = MatAXPY_SeqAIJ; 3536 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3537 A->ops->mult = MatMult_SeqAIJ; 3538 A->ops->multadd = MatMultAdd_SeqAIJ; 3539 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3540 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3541 A->ops->multhermitiantranspose = NULL; 3542 A->ops->multhermitiantransposeadd = NULL; 3543 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3544 ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3545 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3546 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3547 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3548 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3549 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3552 } else { 3553 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3554 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3555 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3556 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3557 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3558 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3559 
A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3560 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3561 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3562 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3563 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3564 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3565 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3566 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3567 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3568 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3569 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3570 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3571 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3572 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3573 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3574 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3575 } 3576 A->boundtocpu = flg; 3577 if (flg && a->inode.size) { 3578 a->inode.use = PETSC_TRUE; 3579 } else { 3580 a->inode.use = PETSC_FALSE; 3581 } 3582 PetscFunctionReturn(0); 3583 } 3584 3585 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3586 { 3587 PetscErrorCode ierr; 3588 cusparseStatus_t stat; 3589 Mat B; 3590 3591 
PetscFunctionBegin; 3592 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3593 if (reuse == MAT_INITIAL_MATRIX) { 3594 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3595 } else if (reuse == MAT_REUSE_MATRIX) { 3596 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3597 } 3598 B = *newmat; 3599 3600 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3601 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3602 3603 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3604 if (B->factortype == MAT_FACTOR_NONE) { 3605 Mat_SeqAIJCUSPARSE *spptr; 3606 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3607 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3608 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3609 spptr->format = MAT_CUSPARSE_CSR; 3610 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3611 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3612 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3613 #else 3614 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3615 #endif 3616 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3617 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3618 #endif 3619 B->spptr = spptr; 3620 } else { 3621 Mat_SeqAIJCUSPARSETriFactors *spptr; 3622 3623 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3624 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3625 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3626 B->spptr = spptr; 3627 } 3628 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3629 } 3630 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3631 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3632 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3633 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3634 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3635 B->ops->duplicate = 
MatDuplicate_SeqAIJCUSPARSE; 3636 3637 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3638 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3639 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3640 #if defined(PETSC_HAVE_HYPRE) 3641 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3642 #endif 3643 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr); 3644 PetscFunctionReturn(0); 3645 } 3646 3647 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3648 { 3649 PetscErrorCode ierr; 3650 3651 PetscFunctionBegin; 3652 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3653 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3654 PetscFunctionReturn(0); 3655 } 3656 3657 /*MC 3658 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3659 3660 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3661 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3662 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3663 3664 Options Database Keys: 3665 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3666 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3667 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3668 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3669 3670 Level: beginner 3671 3672 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3673 M*/ 3674 3675 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3676 3677 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3678 { 3679 PetscErrorCode ierr; 3680 3681 PetscFunctionBegin; 3682 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3683 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3684 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3685 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3686 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3687 3688 PetscFunctionReturn(0); 3689 } 3690 3691 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3692 { 3693 PetscErrorCode ierr; 3694 cusparseStatus_t stat; 3695 3696 PetscFunctionBegin; 3697 if (*cusparsestruct) { 3698 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3699 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3700 delete (*cusparsestruct)->workVector; 3701 delete (*cusparsestruct)->rowoffsets_gpu; 3702 delete (*cusparsestruct)->cooPerm; 3703 delete (*cusparsestruct)->cooPerm_a; 3704 delete (*cusparsestruct)->csr2csc_i; 3705 if ((*cusparsestruct)->handle) {stat = 
cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3706 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3707 } 3708 PetscFunctionReturn(0); 3709 } 3710 3711 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3712 { 3713 PetscFunctionBegin; 3714 if (*mat) { 3715 delete (*mat)->values; 3716 delete (*mat)->column_indices; 3717 delete (*mat)->row_offsets; 3718 delete *mat; 3719 *mat = 0; 3720 } 3721 PetscFunctionReturn(0); 3722 } 3723 3724 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3725 { 3726 cusparseStatus_t stat; 3727 PetscErrorCode ierr; 3728 3729 PetscFunctionBegin; 3730 if (*trifactor) { 3731 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3732 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3733 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3734 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3735 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3736 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3737 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3738 #endif 3739 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3740 } 3741 PetscFunctionReturn(0); 3742 } 3743 3744 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3745 { 3746 CsrMatrix *mat; 3747 cusparseStatus_t stat; 3748 cudaError_t err; 3749 3750 PetscFunctionBegin; 3751 if (*matstruct) { 3752 if ((*matstruct)->mat) { 3753 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3754 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3755 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3756 #else 3757 
cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3758 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3759 #endif 3760 } else { 3761 mat = (CsrMatrix*)(*matstruct)->mat; 3762 CsrMatrix_Destroy(&mat); 3763 } 3764 } 3765 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3766 delete (*matstruct)->cprowIndices; 3767 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3768 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3769 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3770 3771 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3772 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3773 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3774 for (int i=0; i<3; i++) { 3775 if (mdata->cuSpMV[i].initialized) { 3776 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3777 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3778 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3779 } 3780 } 3781 #endif 3782 delete *matstruct; 3783 *matstruct = NULL; 3784 } 3785 PetscFunctionReturn(0); 3786 } 3787 3788 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3789 { 3790 PetscErrorCode ierr; 3791 3792 PetscFunctionBegin; 3793 if (*trifactors) { 3794 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3795 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3796 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3797 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3798 delete (*trifactors)->rpermIndices; 3799 delete (*trifactors)->cpermIndices; 3800 delete (*trifactors)->workVector; 3801 
(*trifactors)->rpermIndices = NULL; 3802 (*trifactors)->cpermIndices = NULL; 3803 (*trifactors)->workVector = NULL; 3804 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3805 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3806 (*trifactors)->init_dev_prop = PETSC_FALSE; 3807 } 3808 PetscFunctionReturn(0); 3809 } 3810 3811 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3812 { 3813 PetscErrorCode ierr; 3814 cusparseHandle_t handle; 3815 cusparseStatus_t stat; 3816 3817 PetscFunctionBegin; 3818 if (*trifactors) { 3819 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3820 if (handle = (*trifactors)->handle) { 3821 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3822 } 3823 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3824 } 3825 PetscFunctionReturn(0); 3826 } 3827 3828 struct IJCompare 3829 { 3830 __host__ __device__ 3831 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3832 { 3833 if (t1.get<0>() < t2.get<0>()) return true; 3834 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3835 return false; 3836 } 3837 }; 3838 3839 struct IJEqual 3840 { 3841 __host__ __device__ 3842 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3843 { 3844 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3845 return true; 3846 } 3847 }; 3848 3849 struct IJDiff 3850 { 3851 __host__ __device__ 3852 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3853 { 3854 return t1 == t2 ? 
0 : 1; 3855 } 3856 }; 3857 3858 struct IJSum 3859 { 3860 __host__ __device__ 3861 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3862 { 3863 return t1||t2; 3864 } 3865 }; 3866 3867 #include <thrust/iterator/discard_iterator.h> 3868 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3869 { 3870 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3871 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3872 THRUSTARRAY *cooPerm_v = NULL; 3873 thrust::device_ptr<const PetscScalar> d_v; 3874 CsrMatrix *matrix; 3875 PetscErrorCode ierr; 3876 PetscInt n; 3877 3878 PetscFunctionBegin; 3879 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3880 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3881 if (!cusp->cooPerm) { 3882 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3883 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3884 PetscFunctionReturn(0); 3885 } 3886 matrix = (CsrMatrix*)cusp->mat->mat; 3887 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3888 if (!v) { 3889 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3890 goto finalize; 3891 } 3892 n = cusp->cooPerm->size(); 3893 if (isCudaMem(v)) { 3894 d_v = thrust::device_pointer_cast(v); 3895 } else { 3896 cooPerm_v = new THRUSTARRAY(n); 3897 cooPerm_v->assign(v,v+n); 3898 d_v = cooPerm_v->data(); 3899 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3900 } 3901 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3902 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3903 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3904 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3905 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3906 /* 
thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3907 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3908 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3909 */ 3910 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3911 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3912 delete cooPerm_w; 3913 } else { 3914 /* all nonzeros in d_v[] are unique entries */ 3915 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3916 matrix->values->begin())); 3917 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3918 matrix->values->end())); 3919 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3920 } 3921 } else { 3922 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3923 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3924 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3925 } else { 3926 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3927 matrix->values->begin())); 3928 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3929 matrix->values->end())); 3930 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3931 } 3932 } 3933 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3934 finalize: 3935 delete cooPerm_v; 3936 A->offloadmask = PETSC_OFFLOAD_GPU; 3937 ierr = 
PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3938 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3939 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3940 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3941 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3942 a->reallocs = 0; 3943 A->info.mallocs += 0; 3944 A->info.nz_unneeded = 0; 3945 A->assembled = A->was_assembled = PETSC_TRUE; 3946 A->num_ass++; 3947 PetscFunctionReturn(0); 3948 } 3949 3950 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3951 { 3952 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3953 PetscErrorCode ierr; 3954 3955 PetscFunctionBegin; 3956 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3957 if (!cusp) PetscFunctionReturn(0); 3958 if (destroy) { 3959 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3960 delete cusp->csr2csc_i; 3961 cusp->csr2csc_i = NULL; 3962 } 3963 A->transupdated = PETSC_FALSE; 3964 PetscFunctionReturn(0); 3965 } 3966 3967 #include <thrust/binary_search.h> 3968 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3969 { 3970 PetscErrorCode ierr; 3971 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3972 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3973 PetscInt cooPerm_n, nzr = 0; 3974 cudaError_t cerr; 3975 3976 PetscFunctionBegin; 3977 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3978 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3979 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3980 if (n != cooPerm_n) { 3981 delete cusp->cooPerm; 3982 delete cusp->cooPerm_a; 3983 cusp->cooPerm = NULL; 3984 cusp->cooPerm_a = NULL; 3985 } 3986 if (n) { 3987 THRUSTINTARRAY d_i(n); 3988 THRUSTINTARRAY d_j(n); 3989 THRUSTINTARRAY ii(A->rmap->n); 3990 3991 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3992 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3993 3994 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3995 d_i.assign(coo_i,coo_i+n); 3996 d_j.assign(coo_j,coo_j+n); 3997 3998 /* Ex. 3999 n = 6 4000 coo_i = [3,3,1,4,1,4] 4001 coo_j = [3,2,2,5,2,6] 4002 */ 4003 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 4004 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 4005 4006 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4007 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4008 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4009 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 4010 THRUSTINTARRAY w = d_j; 4011 4012 /* 4013 d_i = [1,1,3,3,4,4] 4014 d_j = [2,2,2,3,5,6] 4015 cooPerm = [2,4,1,0,3,5] 4016 */ 4017 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4018 4019 /* 4020 d_i = [1,3,3,4,4,x] 4021 ^ekey 4022 d_j = [2,2,3,5,6,x] 4023 ^nekye 4024 */ 4025 if (nekey == ekey) { /* all entries are unique */ 4026 delete cusp->cooPerm_a; 4027 cusp->cooPerm_a = NULL; 4028 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4029 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4030 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4031 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4032 (*cusp->cooPerm_a)[0] = 0; 
/* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4033 w[0] = 0; 4034 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4035 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4036 } 4037 thrust::counting_iterator<PetscInt> search_begin(0); 4038 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4039 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4040 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4041 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4042 4043 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 4044 a->singlemalloc = PETSC_FALSE; 4045 a->free_a = PETSC_TRUE; 4046 a->free_ij = PETSC_TRUE; 4047 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 4048 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4049 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4050 a->nz = a->maxnz = a->i[A->rmap->n]; 4051 a->rmax = 0; 4052 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 4053 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 4054 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4055 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 4056 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 4057 for (PetscInt i = 0; i < A->rmap->n; i++) { 4058 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4059 nzr += (PetscInt)!!(nnzr); 4060 a->ilen[i] = a->imax[i] = nnzr; 4061 a->rmax = PetscMax(a->rmax,nnzr); 4062 } 4063 a->nonzerorowcnt = 
nzr; 4064 A->preallocated = PETSC_TRUE; 4065 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4066 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 4067 } else { 4068 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 4069 } 4070 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 4071 4072 /* We want to allocate the CUSPARSE struct for matvec now. 4073 The code is so convoluted now that I prefer to copy zeros */ 4074 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 4075 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 4076 A->offloadmask = PETSC_OFFLOAD_CPU; 4077 A->nonzerostate++; 4078 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4079 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 4080 4081 A->assembled = PETSC_FALSE; 4082 A->was_assembled = PETSC_FALSE; 4083 PetscFunctionReturn(0); 4084 } 4085 4086 /*@C 4087 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 
4088 4089 Not collective 4090 4091 Input Parameters: 4092 + A - the matrix 4093 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4094 4095 Output Parameters: 4096 + ia - the CSR row pointers 4097 - ja - the CSR column indices 4098 4099 Level: developer 4100 4101 Notes: 4102 When compressed is true, the CSR structure does not contain empty rows 4103 4104 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead() 4105 @*/ 4106 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4107 { 4108 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4109 CsrMatrix *csr; 4110 PetscErrorCode ierr; 4111 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4112 4113 PetscFunctionBegin; 4114 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4115 if (!i || !j) PetscFunctionReturn(0); 4116 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4117 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4118 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4119 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4120 csr = (CsrMatrix*)cusp->mat->mat; 4121 if (i) { 4122 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4123 if (!cusp->rowoffsets_gpu) { 4124 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4125 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4126 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4127 } 4128 *i = cusp->rowoffsets_gpu->data().get(); 4129 } else *i = csr->row_offsets->data().get(); 4130 } 4131 if (j) *j = csr->column_indices->data().get(); 4132 PetscFunctionReturn(0); 4133 } 4134 4135 /*@C 4136 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4137 4138 Not collective 4139 4140 Input Parameters: 4141 + A - 
the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* nothing to release on the device; just invalidate the caller's handles */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparse = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparse->format == MAT_CUSPARSE_ELL || cusparse->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the device copy up to date before exposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparse->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparse->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no object state change to record */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparse = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparse->format == MAT_CUSPARSE_ELL || cusparse->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparse->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparse->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* caller may modify the values: the device copy becomes the authoritative one
     and any cached explicit transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have changed: bump the object state so consumers notice */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data 4298 4299 Level: developer 4300 4301 Notes: does not trigger host-device copies and flags data validity on the GPU 4302 4303 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite() 4304 @*/ 4305 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4306 { 4307 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4308 CsrMatrix *csr; 4309 PetscErrorCode ierr; 4310 4311 PetscFunctionBegin; 4312 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4313 PetscValidPointer(a,2); 4314 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4315 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4316 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4317 csr = (CsrMatrix*)cusp->mat->mat; 4318 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4319 *a = csr->values->data().get(); 4320 A->offloadmask = PETSC_OFFLOAD_GPU; 4321 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 4322 PetscFunctionReturn(0); 4323 } 4324 4325 /*@C 4326 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 4327 4328 Not Collective 4329 4330 Input Parameter: 4331 . A - a MATSEQAIJCUSPARSE matrix 4332 4333 Output Parameter: 4334 . 
a - pointer to the device data 4335 4336 Level: developer 4337 4338 .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 4339 @*/ 4340 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4341 { 4342 PetscErrorCode ierr; 4343 4344 PetscFunctionBegin; 4345 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4346 PetscValidPointer(a,2); 4347 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4348 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4349 *a = NULL; 4350 PetscFunctionReturn(0); 4351 } 4352 4353 struct IJCompare4 4354 { 4355 __host__ __device__ 4356 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4357 { 4358 if (t1.get<0>() < t2.get<0>()) return true; 4359 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4360 return false; 4361 } 4362 }; 4363 4364 struct Shift 4365 { 4366 int _shift; 4367 4368 Shift(int shift) : _shift(shift) {} 4369 __host__ __device__ 4370 inline int operator() (const int &c) 4371 { 4372 return c + _shift; 4373 } 4374 }; 4375 4376 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in matlab notation */ 4377 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4378 { 4379 PetscErrorCode ierr; 4380 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4381 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4382 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4383 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4384 PetscInt Annz,Bnnz; 4385 cusparseStatus_t stat; 4386 PetscInt i,m,n,zero = 0; 4387 cudaError_t cerr; 4388 4389 PetscFunctionBegin; 4390 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4391 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4392 PetscValidPointer(C,4); 4393 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4394 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4395 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 4396 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4397 if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4398 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4399 if (reuse == MAT_INITIAL_MATRIX) { 4400 m = A->rmap->n; 4401 n = A->cmap->n + B->cmap->n; 4402 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 4403 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 4404 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 4405 c = (Mat_SeqAIJ*)(*C)->data; 4406 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4407 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4408 Ccsr = new CsrMatrix; 4409 Cmat->cprowIndices = NULL; 4410 c->compressedrow.use = PETSC_FALSE; 4411 c->compressedrow.nrows = 0; 4412 c->compressedrow.i = NULL; 4413 c->compressedrow.rindex = NULL; 4414 Ccusp->workVector = NULL; 4415 Ccusp->nrows = m; 4416 Ccusp->mat = Cmat; 4417 Ccusp->mat->mat = Ccsr; 4418 
Ccsr->num_rows = m; 4419 Ccsr->num_cols = n; 4420 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 4421 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4422 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4423 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4424 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4425 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4426 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4427 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4428 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4429 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4430 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4431 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4432 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4433 4434 Acsr = (CsrMatrix*)Acusp->mat->mat; 4435 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4436 Annz = (PetscInt)Acsr->column_indices->size(); 4437 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4438 c->nz = Annz + Bnnz; 4439 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4440 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4441 Ccsr->values = new THRUSTARRAY(c->nz); 4442 Ccsr->num_entries = c->nz; 4443 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4444 if (c->nz) { 4445 auto Acoo = new THRUSTINTARRAY32(Annz); 4446 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4447 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4448 THRUSTINTARRAY32 *Aroff,*Broff; 4449 4450 if (a->compressedrow.use) { /* need full row offset */ 4451 if 
(!Acusp->rowoffsets_gpu) { 4452 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4453 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4454 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4455 } 4456 Aroff = Acusp->rowoffsets_gpu; 4457 } else Aroff = Acsr->row_offsets; 4458 if (b->compressedrow.use) { /* need full row offset */ 4459 if (!Bcusp->rowoffsets_gpu) { 4460 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4461 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4462 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 4463 } 4464 Broff = Bcusp->rowoffsets_gpu; 4465 } else Broff = Bcsr->row_offsets; 4466 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4467 stat = cusparseXcsr2coo(Acusp->handle, 4468 Aroff->data().get(), 4469 Annz, 4470 m, 4471 Acoo->data().get(), 4472 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4473 stat = cusparseXcsr2coo(Bcusp->handle, 4474 Broff->data().get(), 4475 Bnnz, 4476 m, 4477 Bcoo->data().get(), 4478 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4479 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4480 auto Aperm = thrust::make_constant_iterator(1); 4481 auto Bperm = thrust::make_constant_iterator(0); 4482 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4483 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4484 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4485 #else 4486 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4487 auto Bcib = Bcsr->column_indices->begin(); 4488 auto Bcie = Bcsr->column_indices->end(); 4489 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4490 #endif 4491 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4492 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4493 auto Aze = 
thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4494 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4495 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4496 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4497 auto p1 = Ccusp->cooPerm->begin(); 4498 auto p2 = Ccusp->cooPerm->begin(); 4499 thrust::advance(p2,Annz); 4500 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4501 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4502 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4503 #endif 4504 auto cci = thrust::make_counting_iterator(zero); 4505 auto cce = thrust::make_counting_iterator(c->nz); 4506 #if 0 //Errors on SUMMIT cuda 11.1.0 4507 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4508 #else 4509 auto pred = thrust::identity<int>(); 4510 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4511 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4512 #endif 4513 stat = cusparseXcoo2csr(Ccusp->handle, 4514 Ccoo->data().get(), 4515 c->nz, 4516 m, 4517 Ccsr->row_offsets->data().get(), 4518 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4519 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4520 delete wPerm; 4521 delete Acoo; 4522 delete Bcoo; 4523 delete Ccoo; 4524 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4525 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4526 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4527 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4528 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4529 #endif 4530 
if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4531 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 4532 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 4533 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4534 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4535 CsrMatrix *CcsrT = new CsrMatrix; 4536 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4537 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4538 4539 (*C)->form_explicit_transpose = PETSC_TRUE; 4540 (*C)->transupdated = PETSC_TRUE; 4541 Ccusp->rowoffsets_gpu = NULL; 4542 CmatT->cprowIndices = NULL; 4543 CmatT->mat = CcsrT; 4544 CcsrT->num_rows = n; 4545 CcsrT->num_cols = m; 4546 CcsrT->num_entries = c->nz; 4547 4548 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4549 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4550 CcsrT->values = new THRUSTARRAY(c->nz); 4551 4552 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4553 auto rT = CcsrT->row_offsets->begin(); 4554 if (AT) { 4555 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4556 thrust::advance(rT,-1); 4557 } 4558 if (BT) { 4559 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4560 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4561 thrust::copy(titb,tite,rT); 4562 } 4563 auto cT = CcsrT->column_indices->begin(); 4564 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4565 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4566 auto vT = CcsrT->values->begin(); 4567 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4568 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4569 ierr = 
PetscLogGpuTimeEnd();CHKERRQ(ierr); 4570 4571 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4572 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4573 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4574 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4575 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4576 cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 4577 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4578 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4579 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4580 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4581 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4582 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4583 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4584 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4585 #endif 4586 Ccusp->matTranspose = CmatT; 4587 } 4588 } 4589 4590 c->singlemalloc = PETSC_FALSE; 4591 c->free_a = PETSC_TRUE; 4592 c->free_ij = PETSC_TRUE; 4593 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4594 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4595 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4596 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4597 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4598 ii = *Ccsr->row_offsets; 4599 jj = *Ccsr->column_indices; 4600 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4601 
cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4602 } else { 4603 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4604 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4605 } 4606 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4607 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4608 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4609 c->maxnz = c->nz; 4610 c->nonzerorowcnt = 0; 4611 c->rmax = 0; 4612 for (i = 0; i < m; i++) { 4613 const PetscInt nn = c->i[i+1] - c->i[i]; 4614 c->ilen[i] = c->imax[i] = nn; 4615 c->nonzerorowcnt += (PetscInt)!!nn; 4616 c->rmax = PetscMax(c->rmax,nn); 4617 } 4618 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4619 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4620 (*C)->nonzerostate++; 4621 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4622 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4623 Ccusp->nonzerostate = (*C)->nonzerostate; 4624 (*C)->preallocated = PETSC_TRUE; 4625 } else { 4626 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4627 c = (Mat_SeqAIJ*)(*C)->data; 4628 if (c->nz) { 4629 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4630 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4631 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4632 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4633 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4634 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4635 if (!Acusp->mat) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4636 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4637 Acsr = (CsrMatrix*)Acusp->mat->mat; 4638 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4639 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4640 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4641 if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4642 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4643 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4644 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4645 auto pmid = Ccusp->cooPerm->begin(); 4646 thrust::advance(pmid,Acsr->num_entries); 4647 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4648 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4649 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4650 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4651 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4652 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4653 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4654 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4655 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4656 
thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4657 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4658 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4659 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4660 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4661 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4662 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4663 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4664 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4665 auto vT = CcsrT->values->begin(); 4666 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4667 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4668 (*C)->transupdated = PETSC_TRUE; 4669 } 4670 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4671 } 4672 } 4673 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4674 (*C)->assembled = PETSC_TRUE; 4675 (*C)->was_assembled = PETSC_FALSE; 4676 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4677 PetscFunctionReturn(0); 4678 } 4679 4680 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4681 { 4682 PetscErrorCode ierr; 4683 bool dmem; 4684 const PetscScalar *av; 4685 cudaError_t cerr; 4686 4687 PetscFunctionBegin; 4688 dmem = isCudaMem(v); 4689 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4690 if (n && idx) { 4691 THRUSTINTARRAY widx(n); 4692 widx.assign(idx,idx+n); 4693 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4694 4695 THRUSTARRAY *w = NULL; 4696 thrust::device_ptr<PetscScalar> dv; 4697 if (dmem) { 4698 dv = thrust::device_pointer_cast(v); 4699 } else { 4700 w = new THRUSTARRAY(n); 4701 dv = w->data(); 4702 } 4703 
thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4704 4705 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4706 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4707 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4708 if (w) { 4709 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4710 } 4711 delete w; 4712 } else { 4713 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4714 } 4715 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4716 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4717 PetscFunctionReturn(0); 4718 } 4719