/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* String table consumed by PetscOptionsEnum() when parsing -mat_cusparse_storage_format style options */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the type-specific implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static
PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Record a user-supplied CUDA stream on the matrix and install it on the matrix's cuSPARSE handle */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Install a user-supplied cuSPARSE handle on the matrix, destroying any handle the matrix
   previously owned, and switch the handle to device pointer mode */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the cuSPARSE handle; no-op unless A is a MATSEQAIJCUSPARSE with a populated spptr */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report MATSOLVERCUSPARSE as the solver type for factors created by this package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
148 149 Level: beginner 150 151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 152 M*/ 153 154 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 155 { 156 PetscErrorCode ierr; 157 PetscInt n = A->rmap->n; 158 159 PetscFunctionBegin; 160 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 161 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 162 (*B)->factortype = ftype; 163 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 164 165 if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); } 166 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 167 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 168 if (!A->boundtocpu) { 169 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 170 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 171 } else { 172 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 173 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 174 } 175 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 176 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 177 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 178 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 179 if (!A->boundtocpu) { 180 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 181 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 182 } else { 183 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 184 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 185 } 186 ierr = 
PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 187 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 188 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 189 190 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 191 (*B)->canuseordering = PETSC_TRUE; 192 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 193 PetscFunctionReturn(0); 194 } 195 196 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 197 { 198 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 199 200 PetscFunctionBegin; 201 switch (op) { 202 case MAT_CUSPARSE_MULT: 203 cusparsestruct->format = format; 204 break; 205 case MAT_CUSPARSE_ALL: 206 cusparsestruct->format = format; 207 break; 208 default: 209 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 210 } 211 PetscFunctionReturn(0); 212 } 213 214 /*@ 215 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 216 operation. Only the MatMult operation can use different GPU storage formats 217 for MPIAIJCUSPARSE matrices. 218 Not Collective 219 220 Input Parameters: 221 + A - Matrix of type SEQAIJCUSPARSE 222 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 223 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 224 225 Output Parameter: 226 227 Level: intermediate 228 229 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 230 @*/ 231 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 232 { 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 237 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 238 PetscFunctionReturn(0); 239 } 240 241 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 242 { 243 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 244 245 PetscFunctionBegin; 246 cusparsestruct->use_cpu_solve = use_cpu; 247 PetscFunctionReturn(0); 248 } 249 250 /*@ 251 MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 252 253 Input Parameters: 254 + A - Matrix of type SEQAIJCUSPARSE 255 - use_cpu - set flag for using the built-in CPU MatSolve 256 257 Output Parameter: 258 259 Notes: 260 The cuSparse LU solver currently computes the factors with the built-in CPU method 261 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 262 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 
263 264 Level: intermediate 265 266 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 267 @*/ 268 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 269 { 270 PetscErrorCode ierr; 271 272 PetscFunctionBegin; 273 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 274 ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr); 275 PetscFunctionReturn(0); 276 } 277 278 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 279 { 280 PetscErrorCode ierr; 281 282 PetscFunctionBegin; 283 switch (op) { 284 case MAT_FORM_EXPLICIT_TRANSPOSE: 285 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 286 if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);} 287 A->form_explicit_transpose = flg; 288 break; 289 default: 290 ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 291 break; 292 } 293 PetscFunctionReturn(0); 294 } 295 296 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 297 298 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 299 { 300 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 301 IS isrow = b->row,iscol = b->col; 302 PetscBool row_identity,col_identity; 303 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 304 PetscErrorCode ierr; 305 306 PetscFunctionBegin; 307 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 308 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 309 B->offloadmask = PETSC_OFFLOAD_CPU; 310 /* determine which version of MatSolve needs to be used. 
*/
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* runtime options (storage format, CPU solve, and the
   cuSPARSE SpMV/SpMM/csr2csc algorithm choices available with CUDA >= 11) */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reset any stale GPU factors, do the CPU symbolic phase, and route the numeric phase here */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reset any stale GPU factors, CPU symbolic phase, numeric phase routed to the CUSPARSE Cholesky */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as ICC above */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or update the values of) the unit-diagonal lower triangular factor on the GPU
   from the host (I)LU factor stored in A, including the cuSPARSE solve analysis */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
*/
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt) 0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt) 0;
        AALo[0]   = (MatScalar) 1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr =
WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or update the values of) the non-unit-diagonal upper triangular factor on the GPU
   from the host (I)LU factor stored in A, including the cuSPARSE solve analysis */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy both ILU triangular factors to the GPU (with solve analysis) and cache the
   row/column permutation indices there when the orderings are not the identity */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr =
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 730 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 731 732 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 733 cusparseTriFactors->nnz=a->nz; 734 735 A->offloadmask = PETSC_OFFLOAD_BOTH; 736 /* lower triangular indices */ 737 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 738 if (!row_identity && !cusparseTriFactors->rpermIndices) { 739 const PetscInt *r; 740 741 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 742 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 743 cusparseTriFactors->rpermIndices->assign(r, r+n); 744 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 745 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 746 } 747 748 /* upper triangular indices */ 749 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 750 if (!col_identity && !cusparseTriFactors->cpermIndices) { 751 const PetscInt *c; 752 753 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 754 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 755 cusparseTriFactors->cpermIndices->assign(c, c+n); 756 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 757 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 758 } 759 PetscFunctionReturn(0); 760 } 761 762 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 763 { 764 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 765 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 766 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 767 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 768 cusparseStatus_t stat; 769 PetscErrorCode ierr; 770 cudaError_t cerr; 771 PetscInt *AiUp, *AjUp; 772 PetscScalar *AAUp; 773 PetscScalar *AALo; 774 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 775 Mat_SeqSBAIJ *b = 
(Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; diagonals are stored inverted (1/v[nz]) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz]; /* lower-factor values: upper values scaled by the inverted diagonal */
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the lower factor is never stored explicitly: it reuses the upper-triangular
           pattern (AiUp/AjUp) and is solved with CUSPARSE_OPERATION_TRANSPOSE below,
           hence fill mode stays UPPER here */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: only refresh the numerical values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds (or updates) the ICC triangular
   factors of A on the GPU via MatSeqAIJCUSPARSEBuildICCTriMatrices(), allocates the
   device work vector, and caches the ordering permutation and its inverse on the
   device when the ordering is not an identity permutation.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles are stored, the diagonal only once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr =
ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    /* cache both the permutation (rperm) and its inverse (cperm) on the device */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization.

   The factorization itself is done on the CPU (MatCholeskyFactorNumeric_SeqAIJ, after
   ensuring A's host copy is current); the resulting triangular factors are then copied
   to the GPU and analyzed.  The solve function pointers are selected according to
   whether the ordering is an identity permutation.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b  = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds CSC copies (i.e. explicit
   transposes) of both triangular factors and runs the cuSPARSE solve analysis on them.
   NOTE(review): these transposed factors are presumably consumed by the transpose-solve
   paths (MatSolveTranspose_SeqAIJCUSPARSE) — confirm against the callers.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  /* the fill mode flips under transposition */
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e.
the CSC */ 1097 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1098 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1099 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1100 loTriFactor->csrMat->values->data().get(), 1101 loTriFactor->csrMat->row_offsets->data().get(), 1102 loTriFactor->csrMat->column_indices->data().get(), 1103 loTriFactorT->csrMat->values->data().get(), 1104 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1105 CUSPARSE_ACTION_NUMERIC,indexBase, 1106 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1107 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1108 #endif 1109 1110 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1111 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1112 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1113 loTriFactor->csrMat->values->data().get(), 1114 loTriFactor->csrMat->row_offsets->data().get(), 1115 loTriFactor->csrMat->column_indices->data().get(), 1116 loTriFactorT->csrMat->values->data().get(), 1117 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1118 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1119 CUSPARSE_ACTION_NUMERIC, indexBase, 1120 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1121 #else 1122 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1123 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1124 #endif 1125 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1126 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1127 1128 /* Create the solve analysis information */ 1129 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1130 
stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* query and allocate the scratch buffer required by the triangular-solve analysis */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  /* the fill mode flips under transposition */
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */ 1192 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1193 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1194 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1195 upTriFactor->csrMat->values->data().get(), 1196 upTriFactor->csrMat->row_offsets->data().get(), 1197 upTriFactor->csrMat->column_indices->data().get(), 1198 upTriFactorT->csrMat->values->data().get(), 1199 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1200 CUSPARSE_ACTION_NUMERIC,indexBase, 1201 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1202 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1203 #endif 1204 1205 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1206 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1207 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1208 upTriFactor->csrMat->values->data().get(), 1209 upTriFactor->csrMat->row_offsets->data().get(), 1210 upTriFactor->csrMat->column_indices->data().get(), 1211 upTriFactorT->csrMat->values->data().get(), 1212 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1213 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1214 CUSPARSE_ACTION_NUMERIC, indexBase, 1215 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat); 1216 #else 1217 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1218 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1219 #endif 1220 1221 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1222 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1223 1224 /* Create the solve analysis information */ 1225 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 
stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* query and allocate the scratch buffer required by the triangular-solve analysis */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor mapping a PetscScalar to a PetscInt via its real part.  Used with
   thrust::transform (further below) to recover the integer permutation produced by
   pushing a thrust::sequence of scalars through csr2csc. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
cusparseStatus_t stat; 1271 cusparseIndexBase_t indexBase; 1272 cudaError_t err; 1273 PetscErrorCode ierr; 1274 1275 PetscFunctionBegin; 1276 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1277 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1278 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1279 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1280 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1281 if (A->transupdated) PetscFunctionReturn(0); 1282 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1283 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1284 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1285 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1286 } 1287 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1288 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1289 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1290 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1291 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1292 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1293 1294 /* set alpha and beta */ 1295 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1296 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1297 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1298 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1299 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1300 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1301 1302 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1303 CsrMatrix *matrixT = new CsrMatrix; 1304 matstructT->mat = matrixT; 1305 matrixT->num_rows = A->cmap->n; 1306 matrixT->num_cols = A->rmap->n; 1307 matrixT->num_entries = a->nz; 1308 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1309 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1310 matrixT->values = new THRUSTARRAY(a->nz); 1311 1312 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1313 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1314 1315 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1316 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1317 stat = cusparseCreateCsr(&matstructT->matDescr, 1318 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1319 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1320 matrixT->values->data().get(), 1321 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1322 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1323 #else 1324 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1325 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1326 1327 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1328 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1329 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1330 */ 1331 if (matrixT->num_entries) { 1332 stat = cusparseCreateCsr(&matstructT->matDescr, 1333 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1334 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1335 matrixT->values->data().get(), 1336 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1337 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1338 1339 } else { 1340 matstructT->matDescr = NULL; 1341 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1342 } 1343 #endif 1344 #endif 1345 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1346 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1347 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1348 #else 1349 CsrMatrix *temp = new CsrMatrix; 1350 CsrMatrix *tempT = new CsrMatrix; 1351 /* First convert HYB to CSR */ 1352 temp->num_rows = A->rmap->n; 1353 temp->num_cols = A->cmap->n; 1354 temp->num_entries = a->nz; 1355 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1356 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1357 temp->values = new THRUSTARRAY(a->nz); 1358 1359 stat = cusparse_hyb2csr(cusparsestruct->handle, 1360 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1361 temp->values->data().get(), 1362 temp->row_offsets->data().get(), 1363 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1364 1365 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1366 tempT->num_rows = A->rmap->n; 1367 tempT->num_cols = A->cmap->n; 1368 tempT->num_entries = a->nz; 1369 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1370 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1371 tempT->values = new THRUSTARRAY(a->nz); 1372 1373 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1374 temp->num_cols, temp->num_entries, 1375 temp->values->data().get(), 1376 temp->row_offsets->data().get(), 1377 temp->column_indices->data().get(), 1378 tempT->values->data().get(), 1379 tempT->column_indices->data().get(), 1380 tempT->row_offsets->data().get(), 1381 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1382 1383 /* Last, convert CSC to HYB */ 1384 cusparseHybMat_t hybMat; 1385 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1386 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1387 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1388 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1389 matstructT->descr, tempT->values->data().get(), 1390 tempT->row_offsets->data().get(), 1391 tempT->column_indices->data().get(), 1392 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1393 1394 /* assign the pointer */ 1395 matstructT->mat = hybMat; 1396 A->transupdated = PETSC_TRUE; 1397 /* delete temporaries */ 1398 if (tempT) { 1399 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1400 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1401 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1402 delete (CsrMatrix*) tempT; 1403 } 1404 if (temp) { 1405 if (temp->values) delete (THRUSTARRAY*) temp->values; 1406 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1407 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1408 delete (CsrMatrix*) temp; 1409 } 1410 #endif 1411 } 1412 } 1413 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */ 1414 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1415 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1416 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1417 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1418 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1419 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1420 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1421 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1422 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1423 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1424 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1425 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1426 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1427 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1428 } 1429 if (!cusparsestruct->csr2csc_i) { 1430 THRUSTARRAY csr2csc_a(matrix->num_entries); 1431 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1432 1433 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1434 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1435 void *csr2cscBuffer; 1436 size_t csr2cscBufferSize; 1437 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1438 A->cmap->n, matrix->num_entries, 1439 matrix->values->data().get(), 1440 cusparsestruct->rowoffsets_gpu->data().get(), 1441 matrix->column_indices->data().get(), 1442 matrixT->values->data().get(), 1443 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 
cusparse_scalartype, 1444 CUSPARSE_ACTION_NUMERIC,indexBase, 1445 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1446 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1447 #endif 1448 1449 if (matrix->num_entries) { 1450 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1451 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1452 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1453 1454 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1455 should be filled with indexBase. So I just take a shortcut here. 1456 */ 1457 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1458 A->cmap->n,matrix->num_entries, 1459 csr2csc_a.data().get(), 1460 cusparsestruct->rowoffsets_gpu->data().get(), 1461 matrix->column_indices->data().get(), 1462 matrixT->values->data().get(), 1463 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1464 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1465 CUSPARSE_ACTION_NUMERIC,indexBase, 1466 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1467 #else 1468 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1469 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1470 #endif 1471 } else { 1472 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1473 } 1474 1475 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1476 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1477 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1478 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1479 #endif 1480 } 1481 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->begin()),
                                        thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                        matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using the transposed triangular
   factor structures stored in A->spptr.  Those structures are built lazily on first use by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().  The right-hand side is reordered with the row
   permutation (rpermIndices) before the solves and the result is reordered with the column
   permutation (cpermIndices) afterwards.  Note the solves are applied U first, then L — the
   reverse of the order used in MatSolve_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation; the result lands in x, which serves as scratch here */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Solve with the upper factor: input xarray, output tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then solve with the lower factor: input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - as MatSolveTranspose_SeqAIJCUSPARSE, but no
   row/column permutations are applied (natural ordering): b -> solve U -> tempGPU -> solve L -> x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the upper factor: input barray, output tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve with the lower factor: input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU with the L and U factors stored in A->spptr.
   b is reordered with the row permutation into the work vector, L then U are solved, and the
   column permutation is applied while copying the result into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U: input xarray, output tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - as MatSolve_SeqAIJCUSPARSE, but with no
   row/column permutations: b -> solve L -> tempGPU -> solve U -> x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: input barray, output tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U: input tempGPU, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyFromGPU - if the up-to-date values live only on the GPU, copy the nonzero
   values (the sparsity pattern does not change on the device) back into the host array a->a and
   mark both copies as current (PETSC_OFFLOAD_BOTH).  No-op for any other offload state.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Return the host values array after syncing any newer GPU values back to the host. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

/* The caller may have modified the host values: mark the CPU copy as authoritative. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access: sync from the GPU but leave the offload mask unchanged on restore. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1842 { 1843 PetscFunctionBegin; 1844 *array = NULL; 1845 PetscFunctionReturn(0); 1846 } 1847 1848 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1849 { 1850 PetscFunctionBegin; 1851 *array = ((Mat_SeqAIJ*)A->data)->a; 1852 PetscFunctionReturn(0); 1853 } 1854 1855 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1856 { 1857 PetscFunctionBegin; 1858 A->offloadmask = PETSC_OFFLOAD_CPU; 1859 *array = NULL; 1860 PetscFunctionReturn(0); 1861 } 1862 1863 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1864 { 1865 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1866 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1867 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1868 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1869 PetscErrorCode ierr; 1870 cusparseStatus_t stat; 1871 PetscBool both = PETSC_TRUE; 1872 cudaError_t err; 1873 1874 PetscFunctionBegin; 1875 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1876 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1877 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1878 CsrMatrix *matrix; 1879 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1880 1881 if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1882 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1883 matrix->values->assign(a->a, a->a+a->nz); 1884 err = WaitForCUDA();CHKERRCUDA(err); 1885 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1886 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1887 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 1888 } else { 1889 PetscInt nnz; 1890 ierr = 
PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1891 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1892 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1893 delete cusparsestruct->workVector; 1894 delete cusparsestruct->rowoffsets_gpu; 1895 cusparsestruct->workVector = NULL; 1896 cusparsestruct->rowoffsets_gpu = NULL; 1897 try { 1898 if (a->compressedrow.use) { 1899 m = a->compressedrow.nrows; 1900 ii = a->compressedrow.i; 1901 ridx = a->compressedrow.rindex; 1902 } else { 1903 m = A->rmap->n; 1904 ii = a->i; 1905 ridx = NULL; 1906 } 1907 if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1908 if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1909 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1910 else nnz = a->nz; 1911 1912 /* create cusparse matrix */ 1913 cusparsestruct->nrows = m; 1914 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1915 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1916 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1917 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1918 1919 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1920 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1921 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1922 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1923 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1924 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1925 stat = cusparseSetPointerMode(cusparsestruct->handle, 
CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1926 1927 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1928 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1929 /* set the matrix */ 1930 CsrMatrix *mat= new CsrMatrix; 1931 mat->num_rows = m; 1932 mat->num_cols = A->cmap->n; 1933 mat->num_entries = nnz; 1934 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1935 mat->row_offsets->assign(ii, ii + m+1); 1936 1937 mat->column_indices = new THRUSTINTARRAY32(nnz); 1938 mat->column_indices->assign(a->j, a->j+nnz); 1939 1940 mat->values = new THRUSTARRAY(nnz); 1941 if (a->a) mat->values->assign(a->a, a->a+nnz); 1942 1943 /* assign the pointer */ 1944 matstruct->mat = mat; 1945 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1946 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1947 stat = cusparseCreateCsr(&matstruct->matDescr, 1948 mat->num_rows, mat->num_cols, mat->num_entries, 1949 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1950 mat->values->data().get(), 1951 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1952 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1953 } 1954 #endif 1955 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1956 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1957 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1958 #else 1959 CsrMatrix *mat= new CsrMatrix; 1960 mat->num_rows = m; 1961 mat->num_cols = A->cmap->n; 1962 mat->num_entries = nnz; 1963 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1964 mat->row_offsets->assign(ii, ii + m+1); 1965 1966 mat->column_indices = new THRUSTINTARRAY32(nnz); 1967 mat->column_indices->assign(a->j, a->j+nnz); 1968 1969 mat->values = new THRUSTARRAY(nnz); 1970 if (a->a) mat->values->assign(a->a, a->a+nnz); 1971 1972 cusparseHybMat_t hybMat; 1973 stat = 
cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1974 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1975 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1976 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1977 matstruct->descr, mat->values->data().get(), 1978 mat->row_offsets->data().get(), 1979 mat->column_indices->data().get(), 1980 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1981 /* assign the pointer */ 1982 matstruct->mat = hybMat; 1983 1984 if (mat) { 1985 if (mat->values) delete (THRUSTARRAY*)mat->values; 1986 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1987 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1988 delete (CsrMatrix*)mat; 1989 } 1990 #endif 1991 } 1992 1993 /* assign the compressed row indices */ 1994 if (a->compressedrow.use) { 1995 cusparsestruct->workVector = new THRUSTARRAY(m); 1996 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1997 matstruct->cprowIndices->assign(ridx,ridx+m); 1998 tmp = m; 1999 } else { 2000 cusparsestruct->workVector = NULL; 2001 matstruct->cprowIndices = NULL; 2002 tmp = 0; 2003 } 2004 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 2005 2006 /* assign the pointer */ 2007 cusparsestruct->mat = matstruct; 2008 } catch(char *ex) { 2009 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2010 } 2011 err = WaitForCUDA();CHKERRCUDA(err); 2012 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 2013 cusparsestruct->nonzerostate = A->nonzerostate; 2014 } 2015 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2016 } 2017 PetscFunctionReturn(0); 2018 } 2019 2020 struct VecCUDAPlusEquals 2021 { 2022 template <typename Tuple> 2023 __host__ __device__ 2024 void operator()(Tuple t) 2025 { 2026 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2027 } 2028 }; 2029 2030 struct VecCUDAEquals 2031 { 2032 
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor over a zipped tuple, assigning in the reverse direction: first = second */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data kept between the symbolic and numeric phases of
   MatProduct for AIJCUSPARSE matrices (attached to C->product->data) */
struct MatMatCusparse {
  PetscBool      cisdense;   /* whether C was MATSEQDENSE on entry, so it is converted back after the GPU product */
  PetscScalar    *Bt;        /* device buffer holding B^T for pre-CUDA-11 csrmm, which cannot transpose B */
  Mat            X;          /* intermediate dense result for PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;      /* flop count precomputed in the symbolic phase */
  CsrMatrix      *Bcsr;      /* uncompressed-row view of B when B uses compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;   /* kept alive for cusparseSpGEMMreuse_compute */
  void *dBuffer5;   /* kept alive for cusparseSpGEMMreuse_compute */
#endif
  size_t               mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor callback for MatMatCusparse (installed as C->product->destroy):
   releases all device buffers, cusparse descriptors, and the intermediate matrix X.
   NOTE: cudaFree(NULL) is a no-op, so unconditionally freeing Bt is safe. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
#endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A)*op(B) where A is SeqAIJCUSPARSE and B is dense (CUDA).
   Supports AB, AtB, ABt, PtAP, RARt; PtAP/RARt first compute X = A*op(B) here and
   then multiply by B (transposed or not) via a dense-dense product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr
  = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select which stored matrix (A or its explicit transpose) and which cusparse
     operation to use, and the dimensions m x n of the sparse-dense product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly formed transpose so cusparse can run the faster non-transpose kernel */
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the sparse product goes into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor: destroy and recreate */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the SpMM workspace */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated mmdata->Bt with cublas geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* C = B * X (dense-dense) finishes R*A*R^t */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* C = B^t * X (dense-dense) finishes P^t*A*P */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the host types the caller gave us */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase of the sparse(AIJCUSPARSE) x dense(CUDA) product: sets the sizes
   and type of C, allocates the MatMatCusparse scratch data, and hooks up the
   numeric-phase callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  switch
  (product->type) {
  /* result dimensions m x n per product type */
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase of sparse-sparse C = op(A)*op(B) with SeqAIJCUSPARSE operands.
   The symbolic phase has already determined C's nonzero pattern and set up the
   SpGEMM descriptors/buffers in C->product->data. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    /* values already on the GPU from the symbolic phase; only the bookkeeping below is needed */
    goto finalize;
  }
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us replace AtB/ABt with the plain AB kernel; the
     symbolic phase must have made the same substitution or the pattern is wrong */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicitly stored matTranspose,
     since cuSPARSE spgemm does not support transposed operands */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* pattern was fixed by SpGEMMreuse in the symbolic phase; only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* recompute with the workspace saved in the symbolic phase, then copy into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  /* legacy (< CUDA 11) numeric spgemm directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of sparse-sparse C = op(A)*op(B) with SeqAIJCUSPARSE operands:
   determines C's sparsity pattern on the GPU with cuSPARSE SpGEMM(-reuse) and
   allocates C's CSR storage and the reusable workspaces. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* symmetric operands: fold AtB/ABt into plain AB and record that we did so,
     so the numeric phase can verify it makes the same substitution */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick operands (explicit transposes for AtB/ABt), result dims m x n, inner dim k,
     and whether B/C use compressed row storage */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident alpha/beta constants, required by CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* short-circuit with an all-empty C instead of calling into cusparse */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets; column
       indices and values are shared with B's stored CSR, only row_offsets differ */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops per (a_ij, b_jk) pair: sum row lengths of B over A's column indices */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts with nnz = 0 / NULL arrays; SpGEMM fills in the sizes,
     and the real arrays are attached after the nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void* dBuffer1 = NULL;
    void* dBuffer2 = NULL;
    void* dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    /* two-pass nnz: first query the three buffer sizes, then run with them allocated */
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
    cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
    /* first numeric compute here so the symbolic phase also yields valid values (reusesym) */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
    ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
  }
2751 #else 2752 size_t bufSize2; 2753 /* ask bufferSize bytes for external memory */ 2754 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2755 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2756 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2757 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2758 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2759 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2760 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2761 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2762 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2763 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2764 /* ask bufferSize again bytes for external memory */ 2765 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2766 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2767 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2768 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2769 /* The CUSPARSE documentation is not clear, nor the API 2770 We need both buffers to perform the operations properly! 2771 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2772 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2773 is stored in the descriptor! What a messy API... 
*/ 2774 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2775 /* compute the intermediate product of A * B */ 2776 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2777 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2778 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2779 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2780 /* get matrix C non-zero entries C_nnz1 */ 2781 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2782 c->nz = (PetscInt) C_nnz1; 2783 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2784 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2785 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2786 Ccsr->values = new THRUSTARRAY(c->nz); 2787 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2788 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2789 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2790 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2791 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2792 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2793 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2794 #else 2795 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2796 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2797 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2798 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2799 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2800 Cmat->descr, 
Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2801 c->nz = cnz; 2802 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2803 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2804 Ccsr->values = new THRUSTARRAY(c->nz); 2805 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2806 2807 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2808 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2809 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2810 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2811 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2812 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2813 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2814 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2815 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2816 #endif 2817 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2818 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2819 finalizesym: 2820 c->singlemalloc = PETSC_FALSE; 2821 c->free_a = PETSC_TRUE; 2822 c->free_ij = PETSC_TRUE; 2823 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2824 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2825 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2826 PetscInt *d_i = c->i; 2827 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2828 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2829 ii = *Ccsr->row_offsets; 2830 jj = *Ccsr->column_indices; 2831 if (ciscompressed) d_i = c->compressedrow.i; 
2832 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2833 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2834 } else { 2835 PetscInt *d_i = c->i; 2836 if (ciscompressed) d_i = c->compressedrow.i; 2837 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2838 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2839 } 2840 if (ciscompressed) { /* need to expand host row offsets */ 2841 PetscInt r = 0; 2842 c->i[0] = 0; 2843 for (k = 0; k < c->compressedrow.nrows; k++) { 2844 const PetscInt next = c->compressedrow.rindex[k]; 2845 const PetscInt old = c->compressedrow.i[k]; 2846 for (; r < next; r++) c->i[r+1] = old; 2847 } 2848 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2849 } 2850 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2851 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2852 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2853 c->maxnz = c->nz; 2854 c->nonzerorowcnt = 0; 2855 c->rmax = 0; 2856 for (k = 0; k < m; k++) { 2857 const PetscInt nn = c->i[k+1] - c->i[k]; 2858 c->ilen[k] = c->imax[k] = nn; 2859 c->nonzerorowcnt += (PetscInt)!!nn; 2860 c->rmax = PetscMax(c->rmax,nn); 2861 } 2862 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2863 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2864 Ccsr->num_entries = c->nz; 2865 2866 C->nonzerostate++; 2867 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2868 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2869 Ccusp->nonzerostate = C->nonzerostate; 2870 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2871 C->preallocated = PETSC_TRUE; 2872 C->assembled = PETSC_FALSE; 2873 C->was_assembled = 
PETSC_FALSE; /* completes the assignment begun on the previous line: C->was_assembled = PETSC_FALSE */
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B.
   Decides, per product type, whether the product is computed on the GPU (cuSPARSE/cuBLAS paths)
   or falls back to the CPU SeqAIJ implementation. For each product type the user may force the
   CPU backend with a -mat*_backend_cpu option; the option name depends on whether the call came
   through the old API (MatMatMult etc., product->api_user set) or the MatProduct API. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only consider the GPU backend when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) { /* triple products also involve product->C */
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A xx; thin wrapper over the shared SpMV kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n). 1-D launch; callers size the grid as (n+255)/256 blocks
   of 256 threads, and the i < n guard handles the ragged tail. idx entries are assumed unique
   (they are compressed-row indices), so no atomics are needed. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose
   (+Add) for SeqAIJCUSPARSE. yy may be NULL (no add); yy may alias zz. Handles compressed-row
   storage (zero rows dropped) via the per-matrix work vector, and dispatches to the cuSPARSE
   generic API (CUDA >= 11) or the legacy csrmv/hybmv API otherwise. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for y = op(A) x; set below only for the CSR format, the only format used with CUDA >= 11 */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: op(A) x == 0, so the result is just the (optional) add vector */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* either let cuSPARSE apply the (conjugate) transpose on the fly, or use a stored explicit transpose */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the cached per-operation descriptors below, so guard against ABI drift */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without the add we save one op per nonzero row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* CPU-side assembly, then drop the cached device-resident matrix if the nonzero pattern changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
nz - number of nonzeros per row (same for all rows) 3293 - nnz - array containing the number of nonzeros in the various rows 3294 (possibly different for each row) or NULL 3295 3296 Output Parameter: 3297 . A - the matrix 3298 3299 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3300 MatXXXXSetPreallocation() paradgm instead of this routine directly. 3301 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3302 3303 Notes: 3304 If nnz is given then nz is ignored 3305 3306 The AIJ format (also called the Yale sparse matrix format or 3307 compressed row storage), is fully compatible with standard Fortran 77 3308 storage. That is, the stored row and column indices can begin at 3309 either one (as in Fortran) or zero. See the users' manual for details. 3310 3311 Specify the preallocated storage with either nz or nnz (not both). 3312 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3313 allocation. For large problems you MUST preallocate memory or you 3314 will get TERRIBLE performance, see the users' manual chapter on matrices. 3315 3316 By default, this format uses inodes (identical nodes) when possible, to 3317 improve numerical efficiency of matrix-vector products and solves. We 3318 search for consecutive rows with the same nonzero structure, thereby 3319 reusing matrix information to achieve increased efficiency. 
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the type is set above, so we can preallocate through the SeqAIJ implementation directly */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy: frees the GPU-side storage (the Mat_SeqAIJCUSPARSE struct for unfactored
   matrices, the triangular-factor struct otherwise), removes every composed method,
   then defers to the SeqAIJ destroy for the host-side data */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* reset all composed methods installed by MatConvert_SeqAIJ_SeqAIJCUSPARSE/MatBindToCPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate: duplicate as SeqAIJ on the host, then convert the copy in place to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X. If X does not use this implementation (different axpy op), fall back
   to the CPU MatAXPY_SeqAIJ; otherwise both matrices are brought to the GPU first. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed implementations: invalidate Y's cached transpose and do the work on the CPU */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if
we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nnz count and no compressed rows: the patterns may actually coincide;
       compare row offsets and column indices directly on the device */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* X's pattern is contained in Y's: use cusparse's sparse matrix addition (spgeam),
       writing the result in place into Y's value array */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a and b are passed by host pointer below */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* the CUDA-11 path must query and allocate an explicit work buffer first */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the default pointer mode used elsewhere */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up, so this is a plain cublas axpy */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* DIFFERENT_NONZERO_PATTERN: fall back to the CPU implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
PetscFunctionReturn(0); 3469 } 3470 3471 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3472 { 3473 PetscErrorCode ierr; 3474 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3475 PetscScalar *ay; 3476 cublasHandle_t cublasv2handle; 3477 cublasStatus_t berr; 3478 PetscBLASInt one = 1, bnz = 1; 3479 3480 PetscFunctionBegin; 3481 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3482 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3483 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3484 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3485 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3486 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3487 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3488 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3489 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3490 PetscFunctionReturn(0); 3491 } 3492 3493 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3494 { 3495 PetscErrorCode ierr; 3496 PetscBool both = PETSC_FALSE; 3497 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3498 3499 PetscFunctionBegin; 3500 if (A->factortype == MAT_FACTOR_NONE) { 3501 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3502 if (spptr->mat) { 3503 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3504 if (matrix->values) { 3505 both = PETSC_TRUE; 3506 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3507 } 3508 } 3509 if (spptr->matTranspose) { 3510 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3511 if (matrix->values) { 3512 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3513 } 3514 } 3515 } 3516 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3517 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3518 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3519 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3520 else A->offloadmask = PETSC_OFFLOAD_CPU; 3521 PetscFunctionReturn(0); 3522 } 3523 3524 static 
PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  /* Switch the matrix operation table between the CPU (SeqAIJ) and GPU (CUSPARSE)
     implementations. flg = PETSC_TRUE binds to the CPU (data is copied back from
     the GPU first); flg = PETSC_FALSE installs the GPU implementations. */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the flag; their ops are managed elsewhere */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the SeqAIJ sub-op table so the defaults apply */
    ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are only used by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: allocates the GPU-side struct (and the
   cusparse handle bound to PETSc's default stream) and installs the GPU op table */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* MAT_INPLACE_MATRIX leaves *newmat == A */

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
 #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
 #endif
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU op table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: create as SeqAIJ, then convert in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
+  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE solver package: the band variant for LU of SeqAIJ matrices,
   and the standard one for LU/Cholesky/ILU/ICC of SeqAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Free the Mat_SeqAIJCUSPARSE struct: destroys the mat and matTranspose mult structs,
   deletes the device work arrays, and releases the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat =
cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix: deletes the three device arrays, then the struct itself */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Overload for triangular-factor structs: releases descriptor, analysis info,
   CSR storage, and the various solve/conversion buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* AA_h is pinned host memory */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Overload for mult structs: frees the format-dependent matrix storage (CSR or HYB),
   descriptors, device scalar constants, and (CUDA >= 11) the SpMV descriptors/buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used by the SpMV calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Reset (but do not free) the triangular-factor container: destroys all four factor
   structs, the permutation index arrays, the work vector, and the band-solver buffers */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Fully destroy the triangular-factor container: reset its contents, release the
   cusparse handle, then free the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if (handle = (*trifactors)->handle) { /* assignment intended: destroy only when a handle exists */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row, col) ordering of COO index tuples */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (row, col) index tuples */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 when equal, 1 otherwise — used with adjacent_difference to flag index changes */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ?
     thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device only */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached transpose as out of date; when destroy is true also free it
   together with the csr2csc index map */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Preallocate from a COO index list: sorts (i,j) on the device, builds the CSR
   structure on the host, and records the permutation (cooPerm) used later by
   MatSetValuesCOO_SeqAIJCUSPARSE to scatter/reduce the values */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts the nonzero rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: drop any previously cached permutation arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                             /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty COO list: set up an empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* assembly is deferred until values are set */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE to always return the row pointers in compressed form (no entries for empty rows); with PETSC_FALSE a full row-pointer array with one entry per row of A is returned even when the matrix keeps a compressed row structure on the device

   Output Parameters:
+  i - the CSR row pointers (device memory, owned by the matrix)
-  j - the CSR column indices (device memory, owned by the matrix)

   Level: developer

   Notes:
   When compressed is true, the CSR structure does not contain empty rows.

   Both i and j must be non-NULL; if either is NULL the routine returns immediately
   without setting the other. The returned pointers are device addresses; pair each
   call with MatSeqAIJCUSPARSERestoreIJ().

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* nothing requested: succeed without touching either output */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* may copy host data to the device if the device copy is stale */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* caller wants full row offsets but device stores compressed rows */
      if (!cusp->rowoffsets_gpu) {
        /* build (and cache on the matrix) the full row-offset array from the host a->i */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE, matching the value passed to MatSeqAIJCUSPARSEGetIJ()

   Output Parameters:
+  i - the CSR row pointers (zeroed on return)
-  j - the CSR column indices (zeroed on return)

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: just invalidate the caller's pointers, no state change on A */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* ensure the device copy is current before handing it out */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data (zeroed on return)

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read access: no object-state increase, unlike the read-write restore routines below */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* bring the current values to the device before the caller modifies them */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may write through the pointer: the device copy becomes the master copy
     and any cached explicit transpose values become stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data (zeroed on return)

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have changed */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* note: unlike the read/read-write accessors, no MatSeqAIJCUSPARSECopyToGPU() here —
     the caller is expected to overwrite every value, so stale device data is irrelevant */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes the master copy */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.  A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.  a - pointer to the device data (zeroed on return)

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values were (re)written */
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Ordering functor for (row, col, value, perm) tuples: sorts/merges by row first,
   then by column; the value and perm slots ride along and do not affect the order. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Unary functor adding a fixed offset to an int; used to shift column indices and
   row offsets when concatenating two matrices. */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in matlab notation */
/* C = [A, B]: A supplies columns [0, A->cmap->n), B supplies columns
   [A->cmap->n, A->cmap->n + B->cmap->n). With MAT_INITIAL_MATRIX the merged CSR is
   built on the device and a permutation (Ccusp->cooPerm) mapping A's and B's entries
   into C is stored; with MAT_REUSE_MATRIX only the numerical values are rescattered
   through that stored permutation. MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* ---- create C and its device-side mult structure ---- */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by the SpMV routines */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz); /* records where each A/B entry lands in C, for the reuse path */
    if (c->nz) {
      /* ---- expand A and B to COO row indices, then merge the two entry streams ---- */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* constant tags: 1 marks entries coming from A, 0 marks entries from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's columns into C's column space [A->cmap->n, n) on the fly */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); /* shift in place; undone after the merge */
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz); /* cooPerm[0..Annz) receives A's destinations, cooPerm[Annz..) B's */
      /* merge the two sorted (row, col) streams; wPerm records the A/B origin tag of each C slot */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* restore B's column indices */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split C slot indices by origin tag: tag 1 (from A) -> p1, tag 0 (from B) -> p2 */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked by rows: A' contributes rows [0, A->cmap->n), B' the rest */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* step back so B's first offset overwrites A's trailing sentinel */
        }
        if (BT) {
          /* B's transpose offsets start after A's nnz entries in the concatenated arrays */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* ---- mirror the CSR structure (i, j only; values are kept on the device) on the host ---- */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* ---- MAT_REUSE_MATRIX: pattern unchanged, rescatter values through the stored cooPerm ---- */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries); /* cooPerm[0..Annz) -> A's slots, cooPerm[Annz..) -> B's slots */
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C through the first part of cooPerm */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C through the second part of cooPerm */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are simply A's then B's, concatenated */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* values only exist on the device */
  PetscFunctionReturn(0);
}

/* Copy n selected entries (or the first n entries when idx is NULL) of the device
   value array of A into v. v may point to host or device memory; isCudaMem() decides,
   and a device-side staging buffer plus a final D2H copy is used for host v when a
   gather through idx is required. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* gather through the index list on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* staging buffer; gathered on device, then copied to host v */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* contiguous prefix copy; direction depends on where v lives */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* NOTE(review): the transfer just logged is device->host (cudaMemcpyDeviceToHost above),
     so this looks like it should be PetscLogGpuToCpu rather than PetscLogCpuToGpu — confirm */
  if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}