/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
/* VecType clashes with a cuSPARSE-defined symbol pulled in below, so drop the PETSc macro first */
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

/* String table for MatCUSPARSEStorageFormat, laid out in the PetscOptionsEnum() convention:
   the enum value names (in 0-based value order), then the enum type name, the common prefix,
   and a NULL terminator. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* Note: entries are ordered by the cuSPARSE enum's integer value (see the quoted header above);
   MatSetFromOptions_SeqAIJCUSPARSE() cross-checks that assumption against the real enums. */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations for the type-specific operations implemented in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* NOTE(review): the two MatSeqAIJCUSPARSEMultStruct_Destroy declarations below are C++
   overloads (this is a .cu/C++ translation unit); the one-argument variant takes a
   TriFactorStruct — confirm this naming is intentional. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/*
   MatCUSPARSESetStream - records the given CUDA stream in A's GPU-side data and attaches it
   to the matrix's cuSPARSE handle, so subsequent cuSPARSE calls on A are issued on that stream.

   Errors with PETSC_ERR_COR if the matrix has no GPU-side data (spptr) yet.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - replaces the cuSPARSE handle stored in A's GPU-side data with the
   given one, destroying the previously owned handle if it differs, and (re)sets the pointer
   mode to CUSPARSE_POINTER_MODE_DEVICE on the active handle.

   NOTE(review): after this call A holds the caller's handle; ownership/destruction of that
   handle appears to be left to the caller (see MatCUSPARSEClearHandle) — confirm.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle we currently own before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - drops (without destroying) the cuSPARSE handle stored in A's
   GPU-side data. A no-op for matrices that are not MATSEQAIJCUSPARSE or have no GPU data.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  /* only forget the handle; it is not destroyed here */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports the solver package name ("cusparse") */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   MatGetFactor_seqaijcusparse_cusparse - creates an (empty) factor matrix B of type
   MATSEQAIJCUSPARSE for A and installs the symbolic-factorization function pointers for the
   requested factor type. When A is bound to the CPU, the plain SeqAIJ symbolic routines are
   installed instead, and the binding is propagated to B if requested.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr =
PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* skip allocation: the symbolic factorization will size the factor */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested GPU
   storage format. For SEQAIJCUSPARSE a single format is kept, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both set the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.
  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently does nothing for matrix types that do not compose the method */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): records the flag that
   selects CPU-based MatSolve for factored matrices (see MatLUFactorNumeric_SeqAIJCUSPARSE). */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
- use_cpu - set flag for using the built-in CPU MatSolve

  Notes:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method to specify if the solve is done on the CPU or GPU (GPU is the default).

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSetOption override: handles MAT_FORM_EXPLICIT_TRANSPOSE (invalidating any cached GPU
   transpose when the option is switched off) and forwards everything else to SeqAIJ. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization for the CUSPARSE solver:
   the factorization itself is done on the CPU by MatLUFactorNumeric_SeqAIJ (after ensuring
   A's values are current on the host), then the solve kernels are selected and, unless
   use_cpu_solve is set, the triangular factors are analyzed and copied to the GPU.

   NOTE(review): B is the factored matrix, yet B->spptr is cast here to Mat_SeqAIJCUSPARSE
   while the symbolic routines below cast the same pointer to Mat_SeqAIJCUSPARSETriFactors —
   confirm which type B->spptr actually holds for factor matrices.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* natural ordering on both sides: permutation-free solve kernels apply */
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    if (!cusparsestruct->use_cpu_solve) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* command-line options:
   GPU storage format (for non-factored matrices), CPU-solve flag, and (CUDA >= 11) the
   cuSPARSE SpMV/SpMM/csr2csc algorithm choices. The hard-coded comparisons after each
   PetscOptionsEnum() guard against NVIDIA renumbering the corresponding cuSPARSE enums,
   since the string tables above assume specific integer values.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any stale GPU triangular-factor data, delegates to the CPU SeqAIJ
   symbolic routine, and installs the CUSPARSE numeric-factorization entry point. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern as ILU above but delegating to MatLUFactorSymbolic_SeqAIJ. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: resets GPU factor data, delegates to the CPU routine, installs the
   CUSPARSE Cholesky numeric entry point. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as ICC above. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors
*cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds (first call) or refreshes (subsequent
   calls) the GPU copy of the unit lower triangular factor L from the CPU-stored combined
   LU factor in A (a factored SeqAIJ matrix). On the first call the CSR structure of L —
   the strict lower triangle of A plus an explicit unit diagonal — is assembled in pinned
   host buffers, uploaded into thrust device arrays, and the cuSPARSE triangular-solve
   analysis is performed; on later calls only the numerical values are re-uploaded.
   The pinned value buffer is kept in loTriFactor->AA_h for reuse.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strict-lower entries of rows 1..n-1 (row 0 has none) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host memory so the later thrust assign()s upload efficiently */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          /* append the unit diagonal entry at the end of the row */
          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 path: query and allocate the analysis/solve work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for fast value-only updates; indices are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - GPU counterpart of the routine above for the
   upper triangular factor U: builds (first call) or value-refreshes (later calls) the CSR
   copy of U on the device and runs the cuSPARSE triangular-solve analysis. Rows are walked
   backwards via a->diag, and the stored reciprocal pivot 1/v[nz] makes U's diagonal explicit
   (descriptor uses CUSPARSE_DIAG_TYPE_NON_UNIT).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask ==
PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] is stored in decreasing order for the U part, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host memory so the later thrust assign()s upload efficiently */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          /* v[nz] holds the stored reciprocal pivot; store 1/(1/d) = d as the explicit diagonal */
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 path: query and allocate the analysis/solve work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for fast value-only updates; indices are no longer needed */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - pushes both ILU triangular factors of the
   factored matrix A to the GPU (building/refreshing them and running the solve analysis),
   allocates the device work vector used by the solves, and uploads the row/column
   permutation index arrays when the orderings are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr =
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* work vector used by the triangular solves; allocated once and reused */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: cache the row permutation on the GPU (only when it is not the identity) */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: cache the column permutation on the GPU (only when it is not the identity) */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the GPU triangular factors used by the
   ICC (incomplete Cholesky) solve from the host factor stored in A.

   On first call (no existing factor structs) it assembles, on the host in pinned memory,
   a CSR upper-triangular factor U with inverted diagonal (AAUp) plus a companion value
   array AALo (each off-diagonal entry of U divided once more by the diagonal, used with
   the TRANSPOSE solve op as the "lower" factor), copies them to the GPU, and runs the
   cusparse triangular-solve analysis for both. On subsequent calls only the numerical
   values are refreshed.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (a) and Mat_SeqSBAIJ (b) — the factor
   appears to be stored in SBAIJ layout here; confirm against the factorization routine.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp,*AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the factor values (enables fast H2D copies) */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers: row i of the SBAIJ factor; the diagonal is the LAST entry of the row (v[nz]) */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: store 1/diag so the solve multiplies rather than divides */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for U; AALo additionally divides by the diagonal (used with the transpose solve) */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 (CUDA >= 9) requires the GENERAL matrix type; fill mode/diag type carry the triangular info */
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 needs an externally allocated work buffer; query its size first */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows,
                                 upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the "lower" factor reuses the UPPER-stored pattern and is solved with the TRANSPOSE op below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: shares the sparsity (AiUp/AjUp) of U but uses the AALo values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Factors already exist on the GPU: only refresh the numerical values.
           Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers (diagonal is the last entry of row i, see first-build branch) */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - pushes the ICC factors to the GPU and
   caches the ordering permutation (and its inverse) in device arrays when the
   factorization ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of the symmetric factor pair: off-diagonals counted twice plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: cache both the permutation and its inverse on the GPU */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky: runs the host
   (SeqAIJ) numeric factorization, then mirrors the factors onto the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds the transposes (CSC forms) of the
   lower and upper triangular factors on the GPU and runs the cusparse triangular-solve
   analysis on them, so MatSolveTranspose can run without transposing on the fly.

   The transposed factors keep the descriptors of the originals except that the fill
   mode is flipped (upper <-> lower) and the solve op is NON_TRANSPOSE, since the
   transpose is now stored explicitly.

   Note: the fix below balances MAT_CUSPARSEGenerateTranspose event logging — the
   original code called PetscLogEventBegin twice (never End), which corrupts -log_view
   event nesting/statistics.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor: same base/type/diag, flipped fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so solve without transposing */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2cscEx2 needs an explicit work buffer; query its size first */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: was PetscLogEventBegin, leaving the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor: same base/type/diag, flipped fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: was PetscLogEventBegin, leaving the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Functor mapping a PetscScalar to a PetscInt via its real part (used to recover
   permutation indices stored as scalar values after csr2csc). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
cusparseStatus_t stat; 1276 cusparseIndexBase_t indexBase; 1277 cudaError_t err; 1278 PetscErrorCode ierr; 1279 1280 PetscFunctionBegin; 1281 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1282 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1283 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1284 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1285 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1286 if (A->transupdated) PetscFunctionReturn(0); 1287 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1288 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1289 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1290 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1291 } 1292 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1293 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1294 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1295 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1296 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1297 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1298 1299 /* set alpha and beta */ 1300 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1301 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1302 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1303 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1304 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1305 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1306 1307 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1308 CsrMatrix *matrixT = new CsrMatrix; 1309 matstructT->mat = matrixT; 1310 matrixT->num_rows = A->cmap->n; 1311 matrixT->num_cols = A->rmap->n; 1312 matrixT->num_entries = a->nz; 1313 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1314 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1315 matrixT->values = new THRUSTARRAY(a->nz); 1316 1317 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1318 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1319 1320 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1321 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1322 stat = cusparseCreateCsr(&matstructT->matDescr, 1323 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1324 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1325 matrixT->values->data().get(), 1326 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1327 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1328 #else 1329 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1330 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1331 1332 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1333 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1334 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1335 */ 1336 if (matrixT->num_entries) { 1337 stat = cusparseCreateCsr(&matstructT->matDescr, 1338 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1339 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1340 matrixT->values->data().get(), 1341 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1342 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1343 1344 } else { 1345 matstructT->matDescr = NULL; 1346 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1347 } 1348 #endif 1349 #endif 1350 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1351 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1352 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1353 #else 1354 CsrMatrix *temp = new CsrMatrix; 1355 CsrMatrix *tempT = new CsrMatrix; 1356 /* First convert HYB to CSR */ 1357 temp->num_rows = A->rmap->n; 1358 temp->num_cols = A->cmap->n; 1359 temp->num_entries = a->nz; 1360 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1361 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1362 temp->values = new THRUSTARRAY(a->nz); 1363 1364 stat = cusparse_hyb2csr(cusparsestruct->handle, 1365 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1366 temp->values->data().get(), 1367 temp->row_offsets->data().get(), 1368 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1369 1370 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1371 tempT->num_rows = A->rmap->n; 1372 tempT->num_cols = A->cmap->n; 1373 tempT->num_entries = a->nz; 1374 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1375 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1376 tempT->values = new THRUSTARRAY(a->nz); 1377 1378 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1379 temp->num_cols, temp->num_entries, 1380 temp->values->data().get(), 1381 temp->row_offsets->data().get(), 1382 temp->column_indices->data().get(), 1383 tempT->values->data().get(), 1384 tempT->column_indices->data().get(), 1385 tempT->row_offsets->data().get(), 1386 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1387 1388 /* Last, convert CSC to HYB */ 1389 cusparseHybMat_t hybMat; 1390 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1391 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1392 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1393 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1394 matstructT->descr, tempT->values->data().get(), 1395 tempT->row_offsets->data().get(), 1396 tempT->column_indices->data().get(), 1397 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1398 1399 /* assign the pointer */ 1400 matstructT->mat = hybMat; 1401 A->transupdated = PETSC_TRUE; 1402 /* delete temporaries */ 1403 if (tempT) { 1404 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1405 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1406 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1407 delete (CsrMatrix*) tempT; 1408 } 1409 if (temp) { 1410 if (temp->values) delete (THRUSTARRAY*) temp->values; 1411 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1412 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1413 delete (CsrMatrix*) temp; 1414 } 1415 #endif 1416 } 1417 } 1418 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
 transpose mat struct may be already present, update data */
  CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
  CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
  /* sanity checks: both A and its transpose must already have fully-populated CSR storage */
  if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
  if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
  if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
  if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
  if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
  if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
  if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
  if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
    ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
  }
  if (!cusparsestruct->csr2csc_i) {
    /* Build the CSR->CSC value permutation once: run csr2csc on the scalar sequence
       0,1,2,... so matrixT->values temporarily holds the source index of each entry,
       then convert it to the integer permutation csr2csc_i used on later updates. */
    THRUSTARRAY csr2csc_a(matrix->num_entries);
    PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

    indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    void   *csr2cscBuffer;
    size_t csr2cscBufferSize;
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
#endif

    if (matrix->num_entries) {
      /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
         mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
         I checked every parameters and they were just fine. I have no clue why cusparse complains.

         Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
         should be filled with indexBase. So I just take a shortcut here.
      */
      stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
    } else {
      matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
    }

    /* harvest the permutation: matrixT->values currently holds the (scalar-encoded) source indices */
    cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
    PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
#endif
  }
  /* fast path for repeated updates: gather A's values through the cached permutation */
  PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(),
                                    cusparsestruct->csr2csc_i->begin()),
                                    thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                    matrixT->values->begin()));
 }
 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
 /* the compressed row indices is not used for matTranspose */
 matstructT->cprowIndices = NULL;
 /* assign the pointer */
 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
 A->transupdated = PETSC_TRUE;
 PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solves A^T x = b using the cached transposed triangular factors (U^T then L^T),
   applying the row permutation up front and the column permutation at the end. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (for the transposed system the factor order is U then L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for a natural (identity) ordering:
   no row/column permutation copies are needed, so b feeds the U^T solve directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr =
         VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solves A x = b with the cached L/U triangular factors, applying the row
   permutation before the L solve and the column permutation after the U solve. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  /* NOTE(review): the end iterator reuses bGPU (not bGPU+n) unlike the transpose variant;
     presumably the copy length is governed by the rpermIndices range -- confirm */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Natural-ordering variant of MatSolve_SeqAIJCUSPARSE: no permutations, so b feeds
   the L solve directly and the U solve writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  ierr =
VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1791 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1792 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1793 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1794 PetscFunctionReturn(0); 1795 } 1796 1797 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1798 { 1799 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1800 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1801 cudaError_t cerr; 1802 PetscErrorCode ierr; 1803 1804 PetscFunctionBegin; 1805 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1806 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1807 1808 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1809 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1810 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1811 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1812 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1813 A->offloadmask = PETSC_OFFLOAD_BOTH; 1814 } 1815 PetscFunctionReturn(0); 1816 } 1817 1818 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1819 { 1820 PetscErrorCode ierr; 1821 1822 PetscFunctionBegin; 1823 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1824 *array = ((Mat_SeqAIJ*)A->data)->a; 1825 PetscFunctionReturn(0); 1826 } 1827 1828 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1829 { 1830 PetscFunctionBegin; 1831 A->offloadmask = PETSC_OFFLOAD_CPU; 1832 *array = NULL; 1833 PetscFunctionReturn(0); 1834 } 1835 1836 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1837 { 1838 PetscErrorCode ierr; 1839 1840 PetscFunctionBegin; 1841 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1842 *array = ((Mat_SeqAIJ*)A->data)->a; 1843 PetscFunctionReturn(0); 1844 } 1845 1846 static PetscErrorCode 
                      MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  /* read-only access: offload mask is left untouched */
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access to the host values array: no device -> host sync is required
   since the caller promises to overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  /* values were (re)written on the host, so the CPU copy is now authoritative */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Mirror the host CSR data onto the GPU. If only the numerical values changed
   (same nonzero pattern, CSR format) just the values array is refreshed; otherwise
   the whole device structure is rebuilt from scratch. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed but pattern did not: transpose data is stale, structure is not */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern changed: tear down the existing device structures */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* only rows with nonzeros are stored; ridx maps back to global row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: copy the pattern only and do not claim both copies match */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants for y = alpha*A*x + beta*y style calls */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* stage a temporary CSR copy on the device, then convert it to HYB/ELL */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat =
cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1979 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1980 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1981 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1982 matstruct->descr, mat->values->data().get(), 1983 mat->row_offsets->data().get(), 1984 mat->column_indices->data().get(), 1985 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1986 /* assign the pointer */ 1987 matstruct->mat = hybMat; 1988 1989 if (mat) { 1990 if (mat->values) delete (THRUSTARRAY*)mat->values; 1991 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1992 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1993 delete (CsrMatrix*)mat; 1994 } 1995 #endif 1996 } 1997 1998 /* assign the compressed row indices */ 1999 if (a->compressedrow.use) { 2000 cusparsestruct->workVector = new THRUSTARRAY(m); 2001 matstruct->cprowIndices = new THRUSTINTARRAY(m); 2002 matstruct->cprowIndices->assign(ridx,ridx+m); 2003 tmp = m; 2004 } else { 2005 cusparsestruct->workVector = NULL; 2006 matstruct->cprowIndices = NULL; 2007 tmp = 0; 2008 } 2009 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 2010 2011 /* assign the pointer */ 2012 cusparsestruct->mat = matstruct; 2013 } catch(char *ex) { 2014 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2015 } 2016 err = WaitForCUDA();CHKERRCUDA(err); 2017 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 2018 cusparsestruct->nonzerostate = A->nonzerostate; 2019 } 2020 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2021 } 2022 PetscFunctionReturn(0); 2023 } 2024 2025 struct VecCUDAPlusEquals 2026 { 2027 template <typename Tuple> 2028 __host__ __device__ 2029 void operator()(Tuple t) 2030 { 2031 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2032 } 2033 }; 2034 2035 struct VecCUDAEquals 2036 { 2037 
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: copy in the opposite direction of VecCUDAEquals, t<0> = t<1> */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product state stored in C->product->data for the sparse*dense and
   sparse*sparse products below; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;   /* caller's C was MATSEQDENSE (CPU): convert back after the numeric phase */
  PetscScalar           *Bt;        /* device buffer for B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;          /* intermediate dense result used by PtAP/RARt */
  PetscBool             reusesym;   /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;      /* flop count precomputed during the symbolic phase */
  CsrMatrix             *Bcsr;      /* full-row-offset view of B when B uses compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;  /* kept alive for cusparseSpGEMMreuse_compute */
  void                  *dBuffer5;  /* kept alive for cusparseSpGEMMreuse_compute */
#endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor callback for MatMatCusparse: frees device buffers, cusparse
   descriptors, the intermediate matrix X, and the struct itself.  All frees
   are guarded, so a partially-initialized struct is handled safely
   (cudaFree(NULL) and delete NULL are no-ops). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
#endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A) op(B) (and PtAP/RARt variants) with A sparse
   (SEQAIJCUSPARSE) and B dense; the result C is dense.  Expects the product
   data created by MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr
  = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the A operand (plain or explicit transpose) and the result dimensions
     for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      /* let cusparse transpose A internally */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose of A */
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt first compute the intermediate product into mmdata->X; others write C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size exceeds what we have */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* materialize B^T into mmdata->Bt with a cublas geam (out-of-place transpose) */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* PtAP/RARt: finish by multiplying the intermediate X with B (transposed for PtAP) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary conversions made for CPU inputs/outputs */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse(A) * dense(B) products: sets C's sizes/type and
   allocates the MatMatCusparse product data (including the B^T buffer and the
   intermediate matrix X where needed). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch
 (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase of the sparse*sparse product C = op(A) op(B) with all matrices
   SEQAIJCUSPARSE, using cusparse SpGEMM (CUDA >= 11) or csrgemm (older CUDA).
   Requires the product data/descriptors built by the matching symbolic phase. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    /* values already computed during symbolic; only bookkeeping remains */
    goto finalize;
  }
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetry lets AtB/ABt degrade to a plain AB; the symbolic phase must have
     recorded that it exploited the same fact, otherwise the plans disagree */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized via explicitly stored transposed matrices, since
     cuSPARSE spgemm does not accept transpose operations (see opA/opB above) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* CUDA >= 11.4: reuse the sparsity analysis done at symbolic time; only the
     numeric compute is repeated here */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* CUDA 11.0-11.3: recompute and copy the result into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  /* pre-CUDA-11: legacy csrgemm writes directly into C's preallocated arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase of the sparse*sparse product: determines C's sparsity pattern
   on the GPU, allocates C's CSR storage, and caches the cusparse SpGEMM
   descriptors/buffers needed by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: AtB/ABt degrade to AB, and record that fact so the
     numeric phase can verify the same plan is used */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select operands (explicit transposes where needed), result sizes, and
     whether compressed row storage propagates from the inputs to C */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits A's compressed-row index set (same nonzero rows) */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident alpha/beta scalars, matching CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* degenerate product: C is all-zero; build empty CSR arrays and skip spgemm */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* shallow-copy B's CSR but substitute full-length row offsets; the copy is
       owned by mmdata->Bcsr (column_indices/values still belong to Bmat) */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops per (A nonzero, matching B row entry) pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts with nnz = 0 and NULL arrays; the SpGEMM calls below
     determine the pattern and we attach the real arrays afterwards */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void* dBuffer1 = NULL;
    void* dBuffer2 = NULL;
    void* dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                              CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                              &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    /* two-pass nnz computation: first query buffer sizes, then run with buffers */
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                   &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
    cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
    c->nz = (PetscInt) C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                  Ccsr->values->data().get());CHKERRCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, NULL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                    CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                    &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
    cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
    /* first numeric compute; later MatProductNumeric calls reuse dBuffer4/dBuffer5 */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
    ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
  }
2756 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2757 size_t bufSize2; 2758 /* ask bufferSize bytes for external memory */ 2759 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2760 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2761 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2762 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2763 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2764 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2765 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2766 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2767 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2768 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2769 /* ask bufferSize again bytes for external memory */ 2770 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2771 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2772 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2773 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2774 /* The CUSPARSE documentation is not clear, nor the API 2775 We need both buffers to perform the operations properly! 2776 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2777 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2778 is stored in the descriptor! What a messy API... 
*/ 2779 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2780 /* compute the intermediate product of A * B */ 2781 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2782 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2783 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2784 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2785 /* get matrix C non-zero entries C_nnz1 */ 2786 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2787 c->nz = (PetscInt) C_nnz1; 2788 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2789 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2790 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2791 Ccsr->values = new THRUSTARRAY(c->nz); 2792 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2793 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2794 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2795 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2796 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2797 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2798 #endif 2799 #else 2800 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2801 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2802 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2803 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2804 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2805 Cmat->descr, Ccsr->row_offsets->data().get(), 
&cnz);CHKERRCUSPARSE(stat); 2806 c->nz = cnz; 2807 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2808 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2809 Ccsr->values = new THRUSTARRAY(c->nz); 2810 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2811 2812 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2813 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2814 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2815 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2816 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2817 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2818 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2819 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2820 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2821 #endif 2822 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2823 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2824 finalizesym: 2825 c->singlemalloc = PETSC_FALSE; 2826 c->free_a = PETSC_TRUE; 2827 c->free_ij = PETSC_TRUE; 2828 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2829 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2830 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2831 PetscInt *d_i = c->i; 2832 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2833 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2834 ii = *Ccsr->row_offsets; 2835 jj = *Ccsr->column_indices; 2836 if (ciscompressed) d_i = c->compressedrow.i; 2837 cerr = 
cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2838 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2839 } else { 2840 PetscInt *d_i = c->i; 2841 if (ciscompressed) d_i = c->compressedrow.i; 2842 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2843 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2844 } 2845 if (ciscompressed) { /* need to expand host row offsets */ 2846 PetscInt r = 0; 2847 c->i[0] = 0; 2848 for (k = 0; k < c->compressedrow.nrows; k++) { 2849 const PetscInt next = c->compressedrow.rindex[k]; 2850 const PetscInt old = c->compressedrow.i[k]; 2851 for (; r < next; r++) c->i[r+1] = old; 2852 } 2853 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2854 } 2855 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2856 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2857 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2858 c->maxnz = c->nz; 2859 c->nonzerorowcnt = 0; 2860 c->rmax = 0; 2861 for (k = 0; k < m; k++) { 2862 const PetscInt nn = c->i[k+1] - c->i[k]; 2863 c->ilen[k] = c->imax[k] = nn; 2864 c->nonzerorowcnt += (PetscInt)!!nn; 2865 c->rmax = PetscMax(c->rmax,nn); 2866 } 2867 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2868 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2869 Ccsr->num_entries = c->nz; 2870 2871 C->nonzerostate++; 2872 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2873 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2874 Ccusp->nonzerostate = C->nonzerostate; 2875 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2876 C->preallocated = PETSC_TRUE; 2877 C->assembled = PETSC_FALSE; 2878 C->was_assembled = PETSC_FALSE; 
  /* If both operands already have up-to-date values on the GPU, the values of C computed here
     can be reused: mark C as GPU-resident so the numeric phase only needs to call MatAssembly */
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B

   Chooses the symbolic-product implementation for this MatProduct:
   - dense B: the CUSPARSE x DENSECUDA kernel, unless A is bound to the CPU;
   - sparse (cusparse) B and C: the CUSPARSE x CUSPARSE kernel, unless the user asked for
     the CPU backend via the -mat*_backend_cpu options below;
   - otherwise: fall back to the plain SeqAIJ dispatch. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* B only counts as "cusparse" when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* one options database query per product type; option names differ between the
       old API (MatMatMult etc.) and the MatProduct API */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx (no transpose, no Hermitian, nothing added) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H*xx (trans=TRUE, herm=TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H*xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx (trans=TRUE, herm=FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* 1-D scatter-add kernel: y[idx[i]] += x[i] for 0 <= i < n.
   Launched with ceil(n/256) blocks of 256 threads; the bounds check guards the
   extra threads of the last block. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Common worker behind all the MatMult/MatMultAdd/MatMultTranspose variants above.
   yy may be NULL (no add) and may alias zz (in-place add). Handles both the plain CSR
   storage and the compressed-row form (zero rows dropped), using cusparsestruct->workVector
   as the short intermediate vector in the compressed case. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;  /* logical lengths of x and y for the generic SpMV API */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* empty matrix: result is just y (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose implicitly on the untransposed storage */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (building on demand) the explicitly stored transpose */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used to index the per-operation cache cuSpMV[]; guard against ABI changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* After the host-side assembly, drop the cached device-side Mat_SeqAIJCUSPARSE device
   matrix if the nonzero pattern changed, since it no longer matches the new structure. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   onnz = A->nonzerostate;   /* nonzero state before the CPU assembly */
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
 nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create, size, type, preallocate: the standard sequential-matrix construction sequence */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Releases the GPU-side storage (factored or unfactored), removes all composed methods
   registered for this type, then delegates to the base SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* unregister the composed methods installed at creation/convert time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the CPU SeqAIJ path, then convert the copy back to the cusparse type in place */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X, attempted on the GPU when both matrices use the cusparse AXPY and CSR format */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mismatched implementations (e.g. one operand bound to CPU): invalidate Y's cached
       transpose and fall back to the host SeqAIJ AXPY */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if
we can turn this into a cublas axpy */ 3404 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3405 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3406 if (eq) { 3407 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3408 } 3409 if (eq) str = SAME_NONZERO_PATTERN; 3410 } 3411 /* spgeam is buggy with one column */ 3412 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3413 3414 if (str == SUBSET_NONZERO_PATTERN) { 3415 cusparseStatus_t stat; 3416 PetscScalar b = 1.0; 3417 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3418 size_t bufferSize; 3419 void *buffer; 3420 cudaError_t cerr; 3421 #endif 3422 3423 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3424 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3425 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3426 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3427 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3428 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3429 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3430 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3431 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3432 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3433 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3434 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3435 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3436 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 
3437 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3438 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3439 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3440 #else 3441 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3442 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3443 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3444 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3445 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3446 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3447 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3448 #endif 3449 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3450 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3451 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3452 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3453 } else if (str == SAME_NONZERO_PATTERN) { 3454 cublasHandle_t cublasv2handle; 3455 cublasStatus_t berr; 3456 PetscBLASInt one = 1, bnz = 1; 3457 3458 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3459 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3460 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3461 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3462 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3463 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3464 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3465 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3466 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3467 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3468 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3469 } else { 3470 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3471 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3472 } 3473 
PetscFunctionReturn(0); 3474 } 3475 3476 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3477 { 3478 PetscErrorCode ierr; 3479 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3480 PetscScalar *ay; 3481 cublasHandle_t cublasv2handle; 3482 cublasStatus_t berr; 3483 PetscBLASInt one = 1, bnz = 1; 3484 3485 PetscFunctionBegin; 3486 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3487 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3488 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3489 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3490 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3491 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3492 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3493 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3494 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3495 PetscFunctionReturn(0); 3496 } 3497 3498 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3499 { 3500 PetscErrorCode ierr; 3501 PetscBool both = PETSC_FALSE; 3502 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3503 3504 PetscFunctionBegin; 3505 if (A->factortype == MAT_FACTOR_NONE) { 3506 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3507 if (spptr->mat) { 3508 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3509 if (matrix->values) { 3510 both = PETSC_TRUE; 3511 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3512 } 3513 } 3514 if (spptr->matTranspose) { 3515 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3516 if (matrix->values) { 3517 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3518 } 3519 } 3520 } 3521 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3522 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3523 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3524 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3525 else A->offloadmask = PETSC_OFFLOAD_CPU; 3526 PetscFunctionReturn(0); 3527 } 3528 3529 static 
PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3530 { 3531 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3532 PetscErrorCode ierr; 3533 3534 PetscFunctionBegin; 3535 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3536 if (flg) { 3537 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3538 3539 A->ops->scale = MatScale_SeqAIJ; 3540 A->ops->axpy = MatAXPY_SeqAIJ; 3541 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3542 A->ops->mult = MatMult_SeqAIJ; 3543 A->ops->multadd = MatMultAdd_SeqAIJ; 3544 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3545 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3546 A->ops->multhermitiantranspose = NULL; 3547 A->ops->multhermitiantransposeadd = NULL; 3548 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3549 ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr); 3550 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3551 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3552 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3553 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3554 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3555 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3556 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3557 } else { 3558 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3559 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3560 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3561 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3562 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3563 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3564 
A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3565 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3566 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3567 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3568 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3569 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3570 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3571 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3572 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3573 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3574 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3575 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3576 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3577 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3578 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3579 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3580 } 3581 A->boundtocpu = flg; 3582 if (flg && a->inode.size) { 3583 a->inode.use = PETSC_TRUE; 3584 } else { 3585 a->inode.use = PETSC_FALSE; 3586 } 3587 PetscFunctionReturn(0); 3588 } 3589 3590 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3591 { 3592 PetscErrorCode ierr; 3593 cusparseStatus_t stat; 3594 Mat B; 3595 3596 
PetscFunctionBegin; 3597 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 3598 if (reuse == MAT_INITIAL_MATRIX) { 3599 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 3600 } else if (reuse == MAT_REUSE_MATRIX) { 3601 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3602 } 3603 B = *newmat; 3604 3605 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 3606 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 3607 3608 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3609 if (B->factortype == MAT_FACTOR_NONE) { 3610 Mat_SeqAIJCUSPARSE *spptr; 3611 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3612 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3613 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3614 spptr->format = MAT_CUSPARSE_CSR; 3615 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3616 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3617 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3618 #else 3619 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3620 #endif 3621 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3622 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3623 #endif 3624 B->spptr = spptr; 3625 } else { 3626 Mat_SeqAIJCUSPARSETriFactors *spptr; 3627 3628 ierr = PetscNew(&spptr);CHKERRQ(ierr); 3629 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3630 stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat); 3631 B->spptr = spptr; 3632 } 3633 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3634 } 3635 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3636 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3637 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3638 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3639 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3640 B->ops->duplicate = 
MatDuplicate_SeqAIJCUSPARSE; 3641 3642 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 3643 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3644 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 3645 #if defined(PETSC_HAVE_HYPRE) 3646 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr); 3647 #endif 3648 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr); 3649 PetscFunctionReturn(0); 3650 } 3651 3652 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3653 { 3654 PetscErrorCode ierr; 3655 3656 PetscFunctionBegin; 3657 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 3658 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3659 PetscFunctionReturn(0); 3660 } 3661 3662 /*MC 3663 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3664 3665 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3666 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3667 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3668 3669 Options Database Keys: 3670 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3671 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3672 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3673 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3674 3675 Level: beginner 3676 3677 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3678 M*/ 3679 3680 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3681 3682 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3683 { 3684 PetscErrorCode ierr; 3685 3686 PetscFunctionBegin; 3687 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3688 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3689 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3690 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3691 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3692 3693 PetscFunctionReturn(0); 3694 } 3695 3696 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3697 { 3698 PetscErrorCode ierr; 3699 cusparseStatus_t stat; 3700 3701 PetscFunctionBegin; 3702 if (*cusparsestruct) { 3703 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3704 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3705 delete (*cusparsestruct)->workVector; 3706 delete (*cusparsestruct)->rowoffsets_gpu; 3707 delete (*cusparsestruct)->cooPerm; 3708 delete (*cusparsestruct)->cooPerm_a; 3709 delete (*cusparsestruct)->csr2csc_i; 3710 if ((*cusparsestruct)->handle) {stat = 
cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3711 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 3712 } 3713 PetscFunctionReturn(0); 3714 } 3715 3716 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3717 { 3718 PetscFunctionBegin; 3719 if (*mat) { 3720 delete (*mat)->values; 3721 delete (*mat)->column_indices; 3722 delete (*mat)->row_offsets; 3723 delete *mat; 3724 *mat = 0; 3725 } 3726 PetscFunctionReturn(0); 3727 } 3728 3729 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3730 { 3731 cusparseStatus_t stat; 3732 PetscErrorCode ierr; 3733 3734 PetscFunctionBegin; 3735 if (*trifactor) { 3736 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3737 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 3738 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 3739 if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 3740 if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3741 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3742 if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3743 #endif 3744 ierr = PetscFree(*trifactor);CHKERRQ(ierr); 3745 } 3746 PetscFunctionReturn(0); 3747 } 3748 3749 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3750 { 3751 CsrMatrix *mat; 3752 cusparseStatus_t stat; 3753 cudaError_t err; 3754 3755 PetscFunctionBegin; 3756 if (*matstruct) { 3757 if ((*matstruct)->mat) { 3758 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3759 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3760 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3761 #else 3762 
cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3763 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3764 #endif 3765 } else { 3766 mat = (CsrMatrix*)(*matstruct)->mat; 3767 CsrMatrix_Destroy(&mat); 3768 } 3769 } 3770 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3771 delete (*matstruct)->cprowIndices; 3772 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3773 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3774 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3775 3776 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3777 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3778 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3779 for (int i=0; i<3; i++) { 3780 if (mdata->cuSpMV[i].initialized) { 3781 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3782 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3783 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3784 } 3785 } 3786 #endif 3787 delete *matstruct; 3788 *matstruct = NULL; 3789 } 3790 PetscFunctionReturn(0); 3791 } 3792 3793 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3794 { 3795 PetscErrorCode ierr; 3796 3797 PetscFunctionBegin; 3798 if (*trifactors) { 3799 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3800 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3801 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3802 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3803 delete (*trifactors)->rpermIndices; 3804 delete (*trifactors)->cpermIndices; 3805 delete (*trifactors)->workVector; 3806 
(*trifactors)->rpermIndices = NULL; 3807 (*trifactors)->cpermIndices = NULL; 3808 (*trifactors)->workVector = NULL; 3809 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3810 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3811 (*trifactors)->init_dev_prop = PETSC_FALSE; 3812 } 3813 PetscFunctionReturn(0); 3814 } 3815 3816 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3817 { 3818 PetscErrorCode ierr; 3819 cusparseHandle_t handle; 3820 cusparseStatus_t stat; 3821 3822 PetscFunctionBegin; 3823 if (*trifactors) { 3824 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3825 if (handle = (*trifactors)->handle) { 3826 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3827 } 3828 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3829 } 3830 PetscFunctionReturn(0); 3831 } 3832 3833 struct IJCompare 3834 { 3835 __host__ __device__ 3836 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3837 { 3838 if (t1.get<0>() < t2.get<0>()) return true; 3839 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3840 return false; 3841 } 3842 }; 3843 3844 struct IJEqual 3845 { 3846 __host__ __device__ 3847 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3848 { 3849 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3850 return true; 3851 } 3852 }; 3853 3854 struct IJDiff 3855 { 3856 __host__ __device__ 3857 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3858 { 3859 return t1 == t2 ? 
0 : 1; 3860 } 3861 }; 3862 3863 struct IJSum 3864 { 3865 __host__ __device__ 3866 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3867 { 3868 return t1||t2; 3869 } 3870 }; 3871 3872 #include <thrust/iterator/discard_iterator.h> 3873 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3874 { 3875 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3876 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3877 THRUSTARRAY *cooPerm_v = NULL; 3878 thrust::device_ptr<const PetscScalar> d_v; 3879 CsrMatrix *matrix; 3880 PetscErrorCode ierr; 3881 PetscInt n; 3882 3883 PetscFunctionBegin; 3884 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3885 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3886 if (!cusp->cooPerm) { 3887 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3888 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3889 PetscFunctionReturn(0); 3890 } 3891 matrix = (CsrMatrix*)cusp->mat->mat; 3892 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3893 if (!v) { 3894 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3895 goto finalize; 3896 } 3897 n = cusp->cooPerm->size(); 3898 if (isCudaMem(v)) { 3899 d_v = thrust::device_pointer_cast(v); 3900 } else { 3901 cooPerm_v = new THRUSTARRAY(n); 3902 cooPerm_v->assign(v,v+n); 3903 d_v = cooPerm_v->data(); 3904 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3905 } 3906 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3907 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3908 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3909 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3910 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3911 /* 
thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3912 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3913 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3914 */ 3915 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3916 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3917 delete cooPerm_w; 3918 } else { 3919 /* all nonzeros in d_v[] are unique entries */ 3920 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3921 matrix->values->begin())); 3922 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3923 matrix->values->end())); 3924 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3925 } 3926 } else { 3927 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3928 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3929 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3930 } else { 3931 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3932 matrix->values->begin())); 3933 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3934 matrix->values->end())); 3935 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3936 } 3937 } 3938 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3939 finalize: 3940 delete cooPerm_v; 3941 A->offloadmask = PETSC_OFFLOAD_GPU; 3942 ierr = 
PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3943 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3944 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3945 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3946 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3947 a->reallocs = 0; 3948 A->info.mallocs += 0; 3949 A->info.nz_unneeded = 0; 3950 A->assembled = A->was_assembled = PETSC_TRUE; 3951 A->num_ass++; 3952 PetscFunctionReturn(0); 3953 } 3954 3955 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3956 { 3957 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3958 PetscErrorCode ierr; 3959 3960 PetscFunctionBegin; 3961 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3962 if (!cusp) PetscFunctionReturn(0); 3963 if (destroy) { 3964 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3965 delete cusp->csr2csc_i; 3966 cusp->csr2csc_i = NULL; 3967 } 3968 A->transupdated = PETSC_FALSE; 3969 PetscFunctionReturn(0); 3970 } 3971 3972 #include <thrust/binary_search.h> 3973 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3974 { 3975 PetscErrorCode ierr; 3976 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3977 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3978 PetscInt cooPerm_n, nzr = 0; 3979 cudaError_t cerr; 3980 3981 PetscFunctionBegin; 3982 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3983 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3984 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3985 if (n != cooPerm_n) { 3986 delete cusp->cooPerm; 3987 delete cusp->cooPerm_a; 3988 cusp->cooPerm = NULL; 3989 cusp->cooPerm_a = NULL; 3990 } 3991 if (n) { 3992 THRUSTINTARRAY d_i(n); 3993 THRUSTINTARRAY d_j(n); 3994 THRUSTINTARRAY ii(A->rmap->n); 3995 3996 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3997 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3998 3999 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 4000 d_i.assign(coo_i,coo_i+n); 4001 d_j.assign(coo_j,coo_j+n); 4002 4003 /* Ex. 4004 n = 6 4005 coo_i = [3,3,1,4,1,4] 4006 coo_j = [3,2,2,5,2,6] 4007 */ 4008 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 4009 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 4010 4011 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4012 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4013 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4014 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 4015 THRUSTINTARRAY w = d_j; 4016 4017 /* 4018 d_i = [1,1,3,3,4,4] 4019 d_j = [2,2,2,3,5,6] 4020 cooPerm = [2,4,1,0,3,5] 4021 */ 4022 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4023 4024 /* 4025 d_i = [1,3,3,4,4,x] 4026 ^ekey 4027 d_j = [2,2,3,5,6,x] 4028 ^nekye 4029 */ 4030 if (nekey == ekey) { /* all entries are unique */ 4031 delete cusp->cooPerm_a; 4032 cusp->cooPerm_a = NULL; 4033 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4034 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4035 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4036 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4037 (*cusp->cooPerm_a)[0] = 0; 
/* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 4038 w[0] = 0; 4039 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4040 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 4041 } 4042 thrust::counting_iterator<PetscInt> search_begin(0); 4043 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4044 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4045 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 4046 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4047 4048 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 4049 a->singlemalloc = PETSC_FALSE; 4050 a->free_a = PETSC_TRUE; 4051 a->free_ij = PETSC_TRUE; 4052 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 4053 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 4054 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4055 a->nz = a->maxnz = a->i[A->rmap->n]; 4056 a->rmax = 0; 4057 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 4058 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 4059 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4060 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 4061 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 4062 for (PetscInt i = 0; i < A->rmap->n; i++) { 4063 const PetscInt nnzr = a->i[i+1] - a->i[i]; 4064 nzr += (PetscInt)!!(nnzr); 4065 a->ilen[i] = a->imax[i] = nnzr; 4066 a->rmax = PetscMax(a->rmax,nnzr); 4067 } 4068 a->nonzerorowcnt = 
nzr; 4069 A->preallocated = PETSC_TRUE; 4070 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 4071 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 4072 } else { 4073 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 4074 } 4075 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 4076 4077 /* We want to allocate the CUSPARSE struct for matvec now. 4078 The code is so convoluted now that I prefer to copy zeros */ 4079 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 4080 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 4081 A->offloadmask = PETSC_OFFLOAD_CPU; 4082 A->nonzerostate++; 4083 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4084 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 4085 4086 A->assembled = PETSC_FALSE; 4087 A->was_assembled = PETSC_FALSE; 4088 PetscFunctionReturn(0); 4089 } 4090 4091 /*@C 4092 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): all-or-nothing guard -- if only one of i,j is non-NULL the function
     silently returns nothing, yet the separate "if (i)"/"if (j)" checks below suggest
     either pointer could be optional; confirm the intended contract */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the CSR data on the device is current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* lazily build and cache the full (uncompressed) device row offsets from the host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only NULLs out the caller's pointers; the device arrays remain owned by A */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* read access: device data must be up to date */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no object state increase, unlike the read-write Restore variants */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may modify values on the device: host copy is now stale */
  /* values may change: invalidate cached transpose values (PETSC_FALSE presumably keeps the structure -- confirm flag semantics) */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have changed */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: intentionally no MatSeqAIJCUSPARSECopyToGPU() here, existing values will be overwritten */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values were (re)written */
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Strict-weak ordering on (row,col) -- the first two entries of the zipped
   (row, col, value, tag) tuples -- used by the thrust::merge in MatSeqAIJCUSPARSEMergeMats() */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Functor adding a fixed offset to an index; used to shift B's column indices by A's column count */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
   [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* C is m x (nA + nB): columns of A followed by columns of B */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    /* standard cuSPARSE mult-struct setup: general zero-based matrix plus device-resident scalar constants */
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr  = (CsrMatrix*)Acusp->mat->mat;
    Bcsr  = (CsrMatrix*)Bcusp->mat->mat;
    Annz  = (PetscInt)Acsr->column_indices->size();
    Bnnz  = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz); /* records where each A/B value lands in C, for the MAT_REUSE_MATRIX path */
    if (c->nz) {
      /* work in COO form: expand both matrices to (row,col,value) triples, merge them
         sorted by (row,col), then convert the merged rows back to CSR offsets */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* entries from A are tagged 1, from B tagged 0; the tag stream is merged along with
         the triples so we can later recover which source each merged slot came from */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's column indices by A->cmap->n on the fly */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); /* shift in place; undone after the merge */
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* restore B's column indices */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the merged positions by source tag: destinations of A's values go to
         cooPerm[0,Annz), destinations of B's values to cooPerm[Annz,nz) */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        /* C^T = [A^T; B^T] (vertical stacking): n rows, m columns */
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overlap: B^T's first offset coincides with A^T's last */
        }
        if (BT) {
          /* B^T's offsets are shifted by a->nz, the number of entries already contributed by A^T */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure on the host so C is a fully valid SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure of C is unchanged, only refresh the values via cooPerm */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C at the positions recorded in cooPerm[0,Annz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C at the positions recorded in cooPerm[Annz,nz) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4662 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4663 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4664 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4665 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4666 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4667 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4668 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4669 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4670 auto vT = CcsrT->values->begin(); 4671 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4672 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4673 (*C)->transupdated = PETSC_TRUE; 4674 } 4675 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4676 } 4677 } 4678 ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4679 (*C)->assembled = PETSC_TRUE; 4680 (*C)->was_assembled = PETSC_FALSE; 4681 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4682 PetscFunctionReturn(0); 4683 } 4684 4685 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4686 { 4687 PetscErrorCode ierr; 4688 bool dmem; 4689 const PetscScalar *av; 4690 cudaError_t cerr; 4691 4692 PetscFunctionBegin; 4693 dmem = isCudaMem(v); 4694 ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); 4695 if (n && idx) { 4696 THRUSTINTARRAY widx(n); 4697 widx.assign(idx,idx+n); 4698 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 4699 4700 THRUSTARRAY *w = NULL; 4701 thrust::device_ptr<PetscScalar> dv; 4702 if (dmem) { 4703 dv = thrust::device_pointer_cast(v); 4704 } else { 4705 w = new THRUSTARRAY(n); 4706 dv = w->data(); 4707 } 4708 
thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4709 4710 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4711 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4712 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4713 if (w) { 4714 cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4715 } 4716 delete w; 4717 } else { 4718 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4719 } 4720 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4721 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4722 PetscFunctionReturn(0); 4723 } 4724