/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#include <cooperative_groups.h>
#endif

/* Names for the MatCUSPARSEStorageFormat enum, used by PetscOptionsEnum() in MatSetFromOptions_SeqAIJCUSPARSE() below */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

    typedef enum {
        CUSPARSE_MV_ALG_DEFAULT = 0,
        CUSPARSE_COOMV_ALG      = 1,
        CUSPARSE_CSRMV_ALG1     = 2,
        CUSPARSE_CSRMV_ALG2     = 3
    } cusparseSpMVAlg_t;

    typedef enum {
        CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
        CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
        CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
        CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
        CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
        CUSPARSE_SPMM_ALG_DEFAULT = 0,
        CUSPARSE_SPMM_COO_ALG1    = 1,
        CUSPARSE_SPMM_COO_ALG2    = 2,
        CUSPARSE_SPMM_COO_ALG3    = 3,
        CUSPARSE_SPMM_COO_ALG4    = 5,
        CUSPARSE_SPMM_CSR_ALG1    = 4,
        CUSPARSE_SPMM_CSR_ALG2    = 6,
    } cusparseSpMMAlg_t;

    typedef enum {
        CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
        CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
    } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations: factorization support */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solves and basic matrix operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode
CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Record a caller-provided CUDA stream in A's GPU data and attach it to A's cuSPARSE handle */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   cstat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}

/* Install a caller-provided cuSPARSE handle on A, destroying any handle A previously
   owned; the pointer mode is (re)set to device so scalars are read from GPU memory */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   cstat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    if (cusp->handle) { /* release the handle the matrix owned before adopting the new one */
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the cuSPARSE handle stored in A; a no-op unless A is a MATSEQAIJCUSPARSE with GPU data */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}

/* Query routine composed on factor matrices: reports the MATSOLVERCUSPARSE solver package name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
154 155 Level: beginner 156 157 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 158 M*/ 159 160 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 161 { 162 PetscErrorCode ierr; 163 PetscInt n = A->rmap->n; 164 165 PetscFunctionBegin; 166 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 167 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 168 (*B)->factortype = ftype; 169 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 170 171 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 172 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 173 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 174 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 175 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 176 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr); 177 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr); 178 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 179 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 180 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 181 ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr); 182 ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr); 183 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 184 185 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 186 (*B)->canuseordering = PETSC_TRUE; 187 ierr = 
PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 188 PetscFunctionReturn(0); 189 } 190 191 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 192 { 193 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 194 195 PetscFunctionBegin; 196 switch (op) { 197 case MAT_CUSPARSE_MULT: 198 cusparsestruct->format = format; 199 break; 200 case MAT_CUSPARSE_ALL: 201 cusparsestruct->format = format; 202 break; 203 default: 204 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 205 } 206 PetscFunctionReturn(0); 207 } 208 209 /*@ 210 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 211 operation. Only the MatMult operation can use different GPU storage formats 212 for MPIAIJCUSPARSE matrices. 213 Not Collective 214 215 Input Parameters: 216 + A - Matrix of type SEQAIJCUSPARSE 217 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 218 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
   The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed on A (no-op for other types) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Handle MAT_FORM_EXPLICIT_TRANSPOSE specially (the stored transpose must be invalidated
   when the option is turned off); all other options are delegated to the SeqAIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: the factorization itself runs on the CPU (MatLUFactorNumeric_SeqAIJ);
   afterwards the triangular factors are analyzed and copied to the GPU for the solves */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* identity permutations: the cheaper natural-ordering solves apply */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process the -mat_cusparse_* options database keys; the cuSPARSE algorithm options
   are only available for CUDA >= 11 and only for non-factored matrices */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: resets any previous GPU triangular-factor data, performs the CPU
   symbolic phase, and installs the CUSPARSE numeric factorization routine */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric =
MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: reset stale GPU factor data, run the CPU symbolic phase, then install
   the CUSPARSE numeric-factorization entry point */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: same pattern as above, delegating the symbolic phase to SeqAIJ */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: same pattern as above, delegating the symbolic phase to SeqAIJ */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (or refresh the values of) the unit-diagonal lower-triangular ILU factor of A
   in CSR form on the GPU and run the cuSPARSE triangular-solve analysis on it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor =
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date values live on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        /* first build: assemble structure and values in pinned host buffers, then upload */
        PetscScalar *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt) 0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt) 0;
        AALo[0] = (MatScalar) 1.0;
        v  = aa;
        vi = aj;
        offset = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          /* append the unit diagonal entry after the strictly-lower part of row i */
          offset += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer (AA_h) for future value-only updates; the index buffers can go */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v  = aa;
        vi = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (or refresh the values of) the upper-triangular ILU factor of A in CSR form
   on the GPU and run the cuSPARSE triangular-solve analysis on it */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors =
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date values live on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        /* first build: assemble structure and values in pinned host buffers, then upload */
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix; rows are walked backwards following a->diag */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer (AA_h) for future value-only updates; the index buffers can go */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* structure already on the GPU: refresh the numerical values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  /* Builds both ILU triangular factors on the GPU and uploads the row/column
     permutation index sets (when the orderings are not identities) */
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector of length n used by the triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 709 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 710 cusparseStatus_t stat; 711 PetscErrorCode ierr; 712 cudaError_t cerr; 713 PetscInt *AiUp, *AjUp; 714 PetscScalar *AAUp; 715 PetscScalar *AALo; 716 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (for nz) and as Mat_SeqSBAIJ (for i/j/a) — the ICC factor appears to be kept in SBAIJ layout; confirm */ 717 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 718 const PetscInt *ai = b->i,*aj = b->j,*vj; 719 const MatScalar *aa = b->a,*v; 720 721 PetscFunctionBegin; /* empty matrix: nothing to build */ 722 if (!n) PetscFunctionReturn(0); /* (re)build only when the freshest factor values live on the CPU */ 723 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 724 try { 725 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 726 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); /* first call: allocate and fill structure and values; on later calls only the values are refreshed (see the else branch) */ 727 if (!upTriFactor && !loTriFactor) { 728 /* Allocate Space for the upper triangular matrix */ 729 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 730 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 731 732 /* Fill the upper triangular matrix */ 733 AiUp[0]=(PetscInt) 0; 734 AiUp[n]=nzUpper; 735 offset = 0; 736 for (i=0; i<n; i++) { 737 /* set the pointers */ 738 v = aa + ai[i]; 739 vj = aj + ai[i]; 740 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 741 742 /* first, set the diagonal elements */ 743 AjUp[offset] = (PetscInt) i; 744 AAUp[offset] = (MatScalar)1.0/v[nz]; 745 AiUp[i] = offset; 746 AALo[offset] = (MatScalar)1.0/v[nz]; 747 748 offset+=1; 749 if (nz>0) { 750 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 751 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 752 for (j=offset; j<offset+nz; j++) { 753 AAUp[j] = -AAUp[j]; 754 AALo[j] = AAUp[j]/v[nz]; 755 } 756 offset+=nz; 757 } 758 } 759 760 /* allocate space for the triangular factor information */ 761 ierr =
PetscNew(&upTriFactor);CHKERRQ(ierr); /* let cusparse use level scheduling in the triangular solve */ 762 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 763 764 /* Create the matrix description */ 765 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 766 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 767 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 768 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 769 #else 770 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 771 #endif 772 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); /* diagonal declared UNIT: with CUSPARSE_DIAG_TYPE_UNIT the solver ignores the stored diagonal entries */ 773 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 774 775 /* set the matrix */ 776 upTriFactor->csrMat = new CsrMatrix; 777 upTriFactor->csrMat->num_rows = A->rmap->n; 778 upTriFactor->csrMat->num_cols = A->cmap->n; 779 upTriFactor->csrMat->num_entries = a->nz; 780 781 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 782 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 783 784 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 785 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 786 787 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 788 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 789 790 /* set the operation */ 791 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 792 793 /* Create the solve analysis information */ 794 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 795 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 796 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 797 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 798 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 799 upTriFactor->csrMat->values->data().get(),
upTriFactor->csrMat->row_offsets->data().get(), 800 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 801 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 802 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 803 #endif 804 805 /* perform the solve analysis */ 806 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 807 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 808 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 809 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 810 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 811 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 812 #endif 813 );CHKERRCUSPARSE(stat); 814 cerr = WaitForCUDA();CHKERRCUDA(cerr); 815 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 816 817 /* assign the pointer */ 818 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 819 820 /* allocate space for the triangular factor information */ 821 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 822 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 823 824 /* Create the matrix description */ 825 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 826 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 827 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 828 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 829 #else 830 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 831 #endif /* intentionally UPPER: the lower factor shares the upper-triangular pattern and is solved with op = TRANSPOSE (set just below) */ 832 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 833 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 834 835 /* set the operation */ 836 loTriFactor->solveOp =
CUSPARSE_OPERATION_TRANSPOSE; 837 838 /* set the matrix */ 839 loTriFactor->csrMat = new CsrMatrix; 840 loTriFactor->csrMat->num_rows = A->rmap->n; 841 loTriFactor->csrMat->num_cols = A->cmap->n; 842 loTriFactor->csrMat->num_entries = a->nz; 843 /* same sparsity pattern as the upper factor; only the values (AALo) differ */ 844 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 845 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 846 847 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 848 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 849 850 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 851 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 852 853 /* Create the solve analysis information */ 854 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 855 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 856 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 857 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 858 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 859 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 860 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 861 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 862 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 863 #endif 864 865 /* perform the solve analysis */ 866 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 867 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 868 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 869 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 870 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 871 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 872 #endif 873 );CHKERRCUSPARSE(stat); 874 cerr = WaitForCUDA();CHKERRCUDA(cerr); 875 ierr =
PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 876 877 /* assign the pointer */ 878 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 879 880 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 881 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 882 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 883 } else { /* structures already exist on the GPU: recompute the host values and upload them */ 884 /* Fill the upper triangular matrix */ 885 offset = 0; 886 for (i=0; i<n; i++) { 887 /* set the pointers */ 888 v = aa + ai[i]; 889 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 890 891 /* first, set the diagonal elements */ 892 AAUp[offset] = 1.0/v[nz]; 893 AALo[offset] = 1.0/v[nz]; 894 895 offset+=1; 896 if (nz>0) { 897 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 898 for (j=offset; j<offset+nz; j++) { 899 AAUp[j] = -AAUp[j]; 900 AALo[j] = AAUp[j]/v[nz]; 901 } 902 offset+=nz; 903 } 904 } 905 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 906 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 907 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 908 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 909 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 910 } 911 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 912 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 913 } catch(char *ex) { 914 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 915 } 916 } 917 PetscFunctionReturn(0); 918 } 919 /* (Re)build the IC(0) factors on the GPU and cache the permutation and its inverse for non-natural orderings; called from the Cholesky numeric factorization. */ 920 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 921 { 922 PetscErrorCode ierr; 923 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 924 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 925 IS ip = a->row; 926 PetscBool perm_identity; 927 PetscInt n = A->rmap->n; 928 929 PetscFunctionBegin; 930 if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 931 ierr =
MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 932 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } /* nnz of the full symmetric factor: off-diagonals counted for both triangles, diagonal once */ 933 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 934 935 A->offloadmask = PETSC_OFFLOAD_BOTH; 936 937 /* lower triangular indices */ 938 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); /* non-natural ordering: upload both the permutation and its inverse */ 939 if (!perm_identity) { 940 IS iip; 941 const PetscInt *irip,*rip; 942 943 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 944 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 945 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 946 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 947 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 948 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 949 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 950 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 951 ierr = ISDestroy(&iip);CHKERRQ(ierr); 952 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 953 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 954 } 955 PetscFunctionReturn(0); 956 } 957 958 #define CHECK_LAUNCH_ERROR() \ 959 do { \ 960 /* Check synchronous errors, i.e. pre-launch */ \ 961 cudaError_t err = cudaGetLastError(); \ 962 if (cudaSuccess != err) { \ 963 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \ 964 } \ 965 /* Check asynchronous errors, i.e.
kernel failed (ULF) */ \ 966 err = cudaDeviceSynchronize(); \ 967 if (cudaSuccess != err) { \ 968 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \ 969 } \ 970 } while (0) 971 /* Numeric Cholesky/ICC: pull current values off the GPU, factor on the CPU, pick the ordering-specific solve routines, then push the triangular factors back to the GPU. */ 972 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 973 { 974 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 975 IS ip = b->row; 976 PetscBool perm_identity; 977 PetscErrorCode ierr; 978 979 PetscFunctionBegin; 980 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 981 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 982 B->offloadmask = PETSC_OFFLOAD_CPU; 983 /* determine which version of MatSolve needs to be used. */ 984 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 985 if (perm_identity) { 986 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 987 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 988 B->ops->matsolve = NULL; 989 B->ops->matsolvetranspose = NULL; 990 } else { 991 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 992 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 993 B->ops->matsolve = NULL; 994 B->ops->matsolvetranspose = NULL; 995 } 996 997 /* get the triangular factors */ 998 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 999 PetscFunctionReturn(0); 1000 } 1001 /* Build CSC (i.e. transposed) copies of both triangular factors and run the cusparse solve analysis on them; the transposed solves can then be done with op = NON_TRANSPOSE on these copies. */ 1002 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1003 { 1004 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1005 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1006 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1007 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1008 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1009 cusparseStatus_t stat; 1010 cusparseIndexBase_t indexBase; 1011 cusparseMatrixType_t matrixType; 1012 cusparseFillMode_t
fillMode; 1013 cusparseDiagType_t diagType; 1014 cudaError_t cerr; 1015 PetscErrorCode ierr; 1016 1017 PetscFunctionBegin; 1018 /* allocate space for the transpose of the lower triangular factor */ 1019 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1020 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1021 1022 /* set the matrix descriptors of the lower triangular factor */ 1023 matrixType = cusparseGetMatType(loTriFactor->descr); 1024 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); /* transposition flips the triangle; type, index base and diag type carry over */ 1025 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1026 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1027 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1028 1029 /* Create the matrix description */ 1030 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1031 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1032 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1033 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1034 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1035 1036 /* set the operation */ 1037 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1038 1039 /* allocate GPU space for the CSC of the lower triangular factor*/ 1040 loTriFactorT->csrMat = new CsrMatrix; /* transpose swaps the dimensions */ 1041 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1042 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1043 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1044 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1045 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1046 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1047 1048 /* compute the transpose of the lower triangular factor, i.e.
the CSC */ 1049 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1050 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1051 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1052 loTriFactor->csrMat->values->data().get(), 1053 loTriFactor->csrMat->row_offsets->data().get(), 1054 loTriFactor->csrMat->column_indices->data().get(), 1055 loTriFactorT->csrMat->values->data().get(), 1056 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1057 CUSPARSE_ACTION_NUMERIC,indexBase, 1058 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1059 cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1060 #endif 1061 1062 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1063 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1064 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1065 loTriFactor->csrMat->values->data().get(), 1066 loTriFactor->csrMat->row_offsets->data().get(), 1067 loTriFactor->csrMat->column_indices->data().get(), 1068 loTriFactorT->csrMat->values->data().get(), 1069 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1070 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1071 CUSPARSE_ACTION_NUMERIC, indexBase, 1072 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1073 #else 1074 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1075 CUSPARSE_ACTION_NUMERIC, indexBase 1076 #endif 1077 );CHKERRCUSPARSE(stat); 1078 cerr = WaitForCUDA();CHKERRCUDA(cerr); /* fix: close the event opened before the csr2csc above (was erroneously a second PetscLogEventBegin, leaving the event unbalanced) */ 1079 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1080 1081 /* Create the solve analysis information */ 1082 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1083 stat =
cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1084 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1085 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1086 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1087 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1088 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1089 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); /* per-factor scratch buffer required by the csrsv2-style analysis/solve API */ 1090 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1091 #endif 1092 1093 /* perform the solve analysis */ 1094 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1095 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1096 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1097 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 1098 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1099 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1100 #endif 1101 );CHKERRCUSPARSE(stat); 1102 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1103 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1104 1105 /* assign the pointer */ 1106 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1107 1108 /*********************************************/ 1109 /* Now the Transpose of the Upper Tri Factor */ 1110 /*********************************************/ 1111 1112 /* allocate space for the transpose of the upper triangular factor */ 1113 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1114 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1115 1116 /* set the matrix descriptors of the upper triangular factor */ 1117 matrixType = cusparseGetMatType(upTriFactor->descr); 1118 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1119
/* flip the triangle for the transposed upper factor */ fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1120 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1121 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1122 1123 /* Create the matrix description */ 1124 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1125 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1126 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1127 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1128 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1129 1130 /* set the operation */ 1131 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1132 1133 /* allocate GPU space for the CSC of the upper triangular factor*/ 1134 upTriFactorT->csrMat = new CsrMatrix; 1135 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1136 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1137 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1138 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1139 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1140 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1141 1142 /* compute the transpose of the upper triangular factor, i.e.
the CSC */ 1143 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1144 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1145 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1146 upTriFactor->csrMat->values->data().get(), 1147 upTriFactor->csrMat->row_offsets->data().get(), 1148 upTriFactor->csrMat->column_indices->data().get(), 1149 upTriFactorT->csrMat->values->data().get(), 1150 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1151 CUSPARSE_ACTION_NUMERIC,indexBase, 1152 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1153 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1154 #endif 1155 1156 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1157 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1158 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1159 upTriFactor->csrMat->values->data().get(), 1160 upTriFactor->csrMat->row_offsets->data().get(), 1161 upTriFactor->csrMat->column_indices->data().get(), 1162 upTriFactorT->csrMat->values->data().get(), 1163 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1164 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1165 CUSPARSE_ACTION_NUMERIC, indexBase, 1166 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1167 #else 1168 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1169 CUSPARSE_ACTION_NUMERIC, indexBase 1170 #endif 1171 );CHKERRCUSPARSE(stat); 1172 cerr = WaitForCUDA();CHKERRCUDA(cerr); /* fix: close the event opened before the csr2csc above (was erroneously a second PetscLogEventBegin, leaving the event unbalanced) */ 1173 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1174 1175 /* Create the solve analysis information */ 1176 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1177 stat =
cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1178 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1179 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1180 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1181 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1182 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1183 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1184 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1185 #endif 1186 1187 /* perform the solve analysis */ 1188 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1189 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1190 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1191 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1192 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1193 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1194 #endif 1195 );CHKERRCUSPARSE(stat); 1196 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1197 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1198 1199 /* assign the pointer */ 1200 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1201 PetscFunctionReturn(0); 1202 } 1203 /* Unary functor: truncate the real part of a PetscScalar to a PetscInt; used with thrust::transform below to turn csr2csc-permuted sequence values into an integer permutation (csr2csc_i). */ 1204 struct PetscScalarToPetscInt 1205 { 1206 __host__ __device__ 1207 PetscInt operator()(PetscScalar s) 1208 { 1209 return (PetscInt)PetscRealPart(s); 1210 } 1211 }; 1212 /* Assemble or refresh the explicit transpose of A on the GPU (used by the transposed multiply paths). For CSR the first call records a value permutation (csr2csc_i); later calls only permute the values. */ 1213 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A) 1214 { 1215 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1216 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1217 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1218 cusparseStatus_t stat; 1219 cusparseIndexBase_t indexBase; 1220 cudaError_t err;
1221 PetscErrorCode ierr; 1222 1223 PetscFunctionBegin; 1224 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1225 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1226 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1227 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct"); 1228 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1229 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct"); /* cached transpose is current: nothing to do */ 1230 if (A->transupdated) PetscFunctionReturn(0); 1231 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1232 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1233 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1234 } 1235 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1236 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1237 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1238 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1239 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1240 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1241 1242 /* set alpha and beta */ /* upload the scalar constants 1 and 0 used as coefficients on the device */ 1243 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1244 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1245 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1246 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1247 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1248 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1249 1250 if
(cusparsestruct->format == MAT_CUSPARSE_CSR) { 1251 CsrMatrix *matrixT = new CsrMatrix; 1252 matstructT->mat = matrixT; /* dimensions of the transpose: rows and columns swapped */ 1253 matrixT->num_rows = A->cmap->n; 1254 matrixT->num_cols = A->rmap->n; 1255 matrixT->num_entries = a->nz; 1256 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1257 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1258 matrixT->values = new THRUSTARRAY(a->nz); 1259 1260 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1261 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1262 1263 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1264 stat = cusparseCreateCsr(&matstructT->matDescr, 1265 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1266 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1267 matrixT->values->data().get(), 1268 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1269 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1270 #endif 1271 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1272 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1273 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1274 #else 1275 CsrMatrix *temp = new CsrMatrix; 1276 CsrMatrix *tempT = new CsrMatrix; 1277 /* First convert HYB to CSR */ 1278 temp->num_rows = A->rmap->n; 1279 temp->num_cols = A->cmap->n; 1280 temp->num_entries = a->nz; 1281 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1282 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1283 temp->values = new THRUSTARRAY(a->nz); 1284 1285 stat = cusparse_hyb2csr(cusparsestruct->handle, 1286 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1287 temp->values->data().get(), 1288 temp->row_offsets->data().get(), 1289 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1290 1291 /* Next, convert CSR to CSC (i.e.
the matrix transpose) */ 1292 tempT->num_rows = A->rmap->n; 1293 tempT->num_cols = A->cmap->n; 1294 tempT->num_entries = a->nz; 1295 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1296 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1297 tempT->values = new THRUSTARRAY(a->nz); 1298 1299 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1300 temp->num_cols, temp->num_entries, 1301 temp->values->data().get(), 1302 temp->row_offsets->data().get(), 1303 temp->column_indices->data().get(), 1304 tempT->values->data().get(), 1305 tempT->column_indices->data().get(), 1306 tempT->row_offsets->data().get(), 1307 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1308 1309 /* Last, convert CSC to HYB */ 1310 cusparseHybMat_t hybMat; 1311 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1312 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1313 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1314 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1315 matstructT->descr, tempT->values->data().get(), 1316 tempT->row_offsets->data().get(), 1317 tempT->column_indices->data().get(), 1318 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1319 1320 /* assign the pointer */ 1321 matstructT->mat = hybMat; 1322 A->transupdated = PETSC_TRUE; 1323 /* delete temporaries */ /* the CSR/CSC temporaries are no longer needed once the HYB matrix is built */ 1324 if (tempT) { 1325 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1326 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1327 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1328 delete (CsrMatrix*) tempT; 1329 } 1330 if (temp) { 1331 if (temp->values) delete (THRUSTARRAY*) temp->values; 1332 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1333 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1334 delete (CsrMatrix*) temp; 1335 } 1336 #endif 1337 } 1338 } 1339 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /*
transpose mat struct may be already present, update data */ 1340 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1341 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1342 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix"); 1343 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows"); 1344 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols"); 1345 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values"); 1346 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT"); 1347 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows"); 1348 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols"); 1349 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values"); 1350 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1351 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1352 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1353 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1354 } 1355 if (!cusparsestruct->csr2csc_i) { /* fill a scalar array with 0,1,2,... so the csr2csc output records where each entry moved */ 1356 THRUSTARRAY csr2csc_a(matrix->num_entries); 1357 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1358 1359 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1360 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1361 void *csr2cscBuffer; 1362 size_t csr2cscBufferSize; 1363 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1364 A->cmap->n, matrix->num_entries, 1365 matrix->values->data().get(), 1366 cusparsestruct->rowoffsets_gpu->data().get(), 1367 matrix->column_indices->data().get(), 1368 matrixT->values->data().get(), 1369 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
cusparse_scalartype, 1370 CUSPARSE_ACTION_NUMERIC,indexBase, 1371 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1372 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1373 #endif 1374 1375 if (matrix->num_entries) { 1376 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1377 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1378 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1379 1380 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1381 should be filled with indexBase. So I just take a shortcut here. 1382 */ 1383 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1384 A->cmap->n,matrix->num_entries, 1385 csr2csc_a.data().get(), 1386 cusparsestruct->rowoffsets_gpu->data().get(), 1387 matrix->column_indices->data().get(), 1388 matrixT->values->data().get(), 1389 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1390 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1391 CUSPARSE_ACTION_NUMERIC,indexBase, 1392 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1393 #else 1394 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1395 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1396 #endif 1397 } else { 1398 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1399 } 1400 1401 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1402 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1403 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1404 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1405 #endif 1406 } 1407 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  /* Triangular solve with the transposed factors: given the factored matrix A,
     computes xx from bb using the cached transposed lower/upper factor structs.
     bb is read-only; xx is write-only (its previous contents are ignored). */
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (only on the first call;
     afterwards the transposed factor structs are cached on cusparseTriFactors) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, gathering the permuted values into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (input xarray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2*nnz - n flops for a full forward+backward triangular solve */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  /* Same as MatSolveTranspose_SeqAIJCUSPARSE but for a factorization done in natural
     ordering: no row/column permutations are applied, so the two triangular solves
     operate directly on the b and x arrays. */
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (input barray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Then, solve L (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  /* Triangular solve with the (non-transposed) cached factors, with row/column
     permutations applied before/after the L and U sweeps.
     bb is read-only; xx is write-only. */
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector.
     NOTE(review): the end iterator reuses bGPU rather than bGPU+n as in
     MatSolveTranspose_SeqAIJCUSPARSE; with thrust::permutation_iterator the range is
     determined by the index iterator, so both forms should traverse rpermIndices
     fully — confirm against Thrust's permutation_iterator semantics. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Then, solve U (input xarray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                       );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation, gathering the result into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1657 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1658 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1659 PetscErrorCode ierr; 1660 cudaError_t cerr; 1661 1662 PetscFunctionBegin; 1663 /* Get the GPU pointers */ 1664 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1665 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1666 1667 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1668 /* First, solve L */ 1669 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1670 loTriFactor->csrMat->num_rows, 1671 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1672 loTriFactor->csrMat->num_entries, 1673 #endif 1674 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1675 loTriFactor->csrMat->values->data().get(), 1676 loTriFactor->csrMat->row_offsets->data().get(), 1677 loTriFactor->csrMat->column_indices->data().get(), 1678 loTriFactor->solveInfo, 1679 barray, tempGPU->data().get() 1680 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1681 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1682 #endif 1683 );CHKERRCUSPARSE(stat); 1684 1685 /* Next, solve U */ 1686 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1687 upTriFactor->csrMat->num_rows, 1688 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1689 upTriFactor->csrMat->num_entries, 1690 #endif 1691 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1692 upTriFactor->csrMat->values->data().get(), 1693 upTriFactor->csrMat->row_offsets->data().get(), 1694 upTriFactor->csrMat->column_indices->data().get(), 1695 upTriFactor->solveInfo, 1696 tempGPU->data().get(), xarray 1697 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1698 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1699 #endif 1700 );CHKERRCUSPARSE(stat); 1701 1702 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1703 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1704 cerr = 
WaitForCUDA();CHKERRCUDA(cerr); 1705 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1706 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1707 PetscFunctionReturn(0); 1708 } 1709 1710 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1711 { 1712 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1713 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1714 cudaError_t cerr; 1715 PetscErrorCode ierr; 1716 1717 PetscFunctionBegin; 1718 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1719 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1720 1721 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1722 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1723 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1724 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1725 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1726 A->offloadmask = PETSC_OFFLOAD_BOTH; 1727 } 1728 PetscFunctionReturn(0); 1729 } 1730 1731 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1732 { 1733 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1734 PetscErrorCode ierr; 1735 1736 PetscFunctionBegin; 1737 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1738 *array = a->a; 1739 A->offloadmask = PETSC_OFFLOAD_CPU; 1740 PetscFunctionReturn(0); 1741 } 1742 1743 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1744 { 1745 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1746 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1747 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1748 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1749 PetscErrorCode ierr; 1750 cusparseStatus_t stat; 1751 PetscBool both = PETSC_TRUE; 1752 cudaError_t err; 1753 1754 PetscFunctionBegin; 1755 if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1756 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || 
/* Mirror the host AIJ matrix onto the GPU in the requested cusparse storage format.
   Two paths:
     - same nonzero pattern as the cached device copy and CSR format: copy values only;
     - otherwise: destroy and rebuild the whole device structure (CSR, or ELL/HYB on
       CUDA < 11), invalidating any cached transpose.
   Errors out if the matrix is bound to the CPU. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose values are stale (structure kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or format) changed: rebuild everything from scratch */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage skips empty rows: m/ii/ridx then describe only the
           nonempty rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* a->a may be absent (structure-only matrix); then only the pattern is copied
           and the offload mask is NOT set to BOTH below */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (1, 0, 1) used with
           CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR, convert it to HYB, then discard the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1918 }; 1919 1920 struct VecCUDAEqualsReverse 1921 { 1922 template <typename Tuple> 1923 __host__ __device__ 1924 void operator()(Tuple t) 1925 { 1926 thrust::get<0>(t) = thrust::get<1>(t); 1927 } 1928 }; 1929 1930 struct MatMatCusparse { 1931 PetscBool cisdense; 1932 PetscScalar *Bt; 1933 Mat X; 1934 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1935 PetscLogDouble flops; 1936 CsrMatrix *Bcsr; 1937 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1938 cusparseSpMatDescr_t matSpBDescr; 1939 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1940 cusparseDnMatDescr_t matBDescr; 1941 cusparseDnMatDescr_t matCDescr; 1942 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1943 size_t mmBufferSize; 1944 void *mmBuffer; 1945 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1946 cusparseSpGEMMDescr_t spgemmDesc; 1947 #endif 1948 }; 1949 1950 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1951 { 1952 PetscErrorCode ierr; 1953 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1954 cudaError_t cerr; 1955 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1956 cusparseStatus_t stat; 1957 #endif 1958 1959 PetscFunctionBegin; 1960 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1961 delete mmdata->Bcsr; 1962 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1963 if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 1964 if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1965 if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1966 if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1967 if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1968 if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1969 #endif 1970 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 
1971 ierr = PetscFree(data);CHKERRQ(ierr); 1972 PetscFunctionReturn(0); 1973 } 1974 1975 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1976 1977 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1978 { 1979 Mat_Product *product = C->product; 1980 Mat A,B; 1981 PetscInt m,n,blda,clda; 1982 PetscBool flg,biscuda; 1983 Mat_SeqAIJCUSPARSE *cusp; 1984 cusparseStatus_t stat; 1985 cusparseOperation_t opA; 1986 const PetscScalar *barray; 1987 PetscScalar *carray; 1988 PetscErrorCode ierr; 1989 MatMatCusparse *mmdata; 1990 Mat_SeqAIJCUSPARSEMultStruct *mat; 1991 CsrMatrix *csrmat; 1992 cudaError_t cerr; 1993 1994 PetscFunctionBegin; 1995 MatCheckProduct(C,1); 1996 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1997 mmdata = (MatMatCusparse*)product->data; 1998 A = product->A; 1999 B = product->B; 2000 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2001 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2002 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2003 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2004 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2005 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2006 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2007 switch (product->type) { 2008 case MATPRODUCT_AB: 2009 case MATPRODUCT_PtAP: 2010 mat = cusp->mat; 2011 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2012 m = A->rmap->n; 2013 n = B->cmap->n; 2014 break; 2015 case MATPRODUCT_AtB: 2016 if (!A->form_explicit_transpose) { 2017 mat = cusp->mat; 2018 opA = CUSPARSE_OPERATION_TRANSPOSE; 2019 } else { 2020 ierr = 
MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2021 mat = cusp->matTranspose; 2022 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2023 } 2024 m = A->cmap->n; 2025 n = B->cmap->n; 2026 break; 2027 case MATPRODUCT_ABt: 2028 case MATPRODUCT_RARt: 2029 mat = cusp->mat; 2030 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2031 m = A->rmap->n; 2032 n = B->rmap->n; 2033 break; 2034 default: 2035 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2036 } 2037 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2038 csrmat = (CsrMatrix*)mat->mat; 2039 /* if the user passed a CPU matrix, copy the data to the GPU */ 2040 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2041 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2042 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2043 2044 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2045 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2046 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2047 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2048 } else { 2049 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2050 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2051 } 2052 2053 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2054 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2055 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2056 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2057 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2058 size_t mmBufferSize; 2059 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2060 if (!mmdata->matBDescr) { 2061 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2062 mmdata->Blda = blda; 2063 } 2064 2065 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2066 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2067 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2068 mmdata->Clda = clda; 2069 } 2070 2071 if (!mat->matDescr) { 2072 stat = cusparseCreateCsr(&mat->matDescr, 2073 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2074 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2075 csrmat->values->data().get(), 2076 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2077 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2078 } 2079 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2080 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2081 mmdata->matCDescr,cusparse_scalartype, 2082 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2083 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2084 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2085 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2086 mmdata->mmBufferSize = mmBufferSize; 2087 } 2088 mmdata->initialized = PETSC_TRUE; 2089 } else { 2090 /* 
to be safe, always update pointers of the mats */ 2091 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2092 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2093 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2094 } 2095 2096 /* do cusparseSpMM, which supports transpose on B */ 2097 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2098 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2099 mmdata->matCDescr,cusparse_scalartype, 2100 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2101 #else 2102 PetscInt k; 2103 /* cusparseXcsrmm does not support transpose on B */ 2104 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2105 cublasHandle_t cublasv2handle; 2106 cublasStatus_t cerr; 2107 2108 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2109 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2110 B->cmap->n,B->rmap->n, 2111 &PETSC_CUSPARSE_ONE ,barray,blda, 2112 &PETSC_CUSPARSE_ZERO,barray,blda, 2113 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2114 blda = B->cmap->n; 2115 k = B->cmap->n; 2116 } else { 2117 k = B->rmap->n; 2118 } 2119 2120 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2121 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2122 csrmat->num_entries,mat->alpha_one,mat->descr, 2123 csrmat->values->data().get(), 2124 csrmat->row_offsets->data().get(), 2125 csrmat->column_indices->data().get(), 2126 mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* one multiply and one add per stored entry of A, for each of the n dense columns */
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    /* the sparse kernel above produced X = A*R^T (see the symbolic phase sizing of mmdata->X);
       finish with the dense-dense product C = R*X */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    /* X = A*P was produced above; finish with C = P^T*X (the PETSC_TRUE flag presumably
       requests the transpose of the first dense operand -- confirm with the Private helper) */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* the caller originally asked for a host MATSEQDENSE result: convert back in place */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* B was converted to CUDA dense on entry; restore its original type */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for products of a SeqAIJCUSPARSE matrix A (CSR storage only) with a dense
   matrix B: computes the sizes of C, sets its type to MATSEQDENSECUDA (remembering whether
   the user asked for a plain MATSEQDENSE result), allocates the MatMatCusparse product data
   (plus B^T scratch for old CUDA, and the intermediate dense X for RARt/PtAP), and installs
   the numeric kernel. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions of C for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase for sparse-sparse products with A, B, C all SeqAIJCUSPARSE (CSR only):
   reuses the cuSPARSE SpGEMM descriptor/buffers prepared in the symbolic phase (CUDA >= 11)
   or calls the legacy csrgemm (CUDA < 11), then marks C assembled. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* structurally empty C: nothing to compute, just (re)assemble */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the device copies of A and B values are current before multiplying */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  ptype = product->type;
  /* for symmetric operands the transpose can be dropped and the plain mult structs used */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  /* select which (possibly explicitly transposed) mult structs participate */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* rerun the numeric SpGEMM with the descriptor and work buffer set up in the symbolic phase */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* copy the result held in the SpGEMM descriptor into C's CSR arrays */
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled =
PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse-sparse products C = A*B, A^T*B or A*B^T with A,B SeqAIJCUSPARSE
   (CSR storage only). Builds C's GPU CSR structure, handles compressed-row operands,
   precomputes the flop count, and (CUDA >= 11) runs the full cusparseSpGEMM protocol since
   the generic API offers no purely symbolic step. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* symmetric operands allow dropping the transpose */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick result sizes, the participating mult structs (forming explicit transposes as
     needed), and whether the compressed-row paths must be taken */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows  = Ccusp->nrows;
  Ccsr->num_cols  = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    /* shallow view of B's CSR that shares columns/values but uses full (uncompressed) row offsets */
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr)
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each entry a(i,l) multiplies the whole of B's row l: 2 flops per resulting pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A pairs with row i of B in the outer-product formulation of A^T*B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts with no arrays attached; pointers are set after nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* now that the arrays exist, attach them to C's descriptor and extract the result */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  /* legacy two-step csrgemm: nnz count first, then allocate, then numeric */
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU structure on the host (CSR i/j arrays and per-row bookkeeping) */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /*
need to expand host row offsets */
    /* turn the compressed row offsets (nonzero rows only) into full m+1 offsets in c->i */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old; /* rows before the next nonzero row are empty */
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row lengths and statistics from the freshly copied row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B: dispatches to the CUDA dense kernels, the sparse-sparse
   SpGEMM kernels, the basic ABC combination, or the CPU AIJ fallback, depending on the
   operand types and their bound-to-CPU state */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) { /* A pinned to the CPU: use the host AIJ x dense path */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n): one thread per entry, 1-D grid, with a tail guard.
   NOTE(review): the local index is int while n is PetscInt; launches must keep
   n * 1 thread within int range -- confirm callers' grid sizing. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared worker for all MatMult* wrappers above: y may be NULL (no add), y may alias z,
   and the matrix may be stored in compressed-row form (zero rows dropped), in which case
   a work vector holds the short product and a scatter(-add) maps it to/from full length. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: op(A)x is zero, so the result is just y (or 0) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* either let cusparse do op(A) implicitly, or multiply with an explicitly stored transpose */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache below, so it must stay in [0,2] */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand: dense-vector descriptors and SpMV work buffer are cached per operation */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      /* pre-CUDA-11 legacy csrmv path */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finish assembly; if a device ("split CSR") matrix exists, mark the GPU copy as the valid one */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.
n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy: free the GPU-side structures (mult structs or triangular factors), the
   optional device "split CSR" matrix, then fall through to the host AIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* save the device-matrix pointer before spptr is torn down */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so its internally allocated device arrays can be freed */
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* remove all composed methods installed by MatConvert/MatBindToCPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by duplicating the host AIJ matrix, then converting the copy in place to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y += a*X on the GPU. Uses cublas axpy when the nonzero patterns match,
   cusparse spgeam for SUBSET patterns, and falls back to the host AIJ code otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* one of the two is bound to the CPU: use the host path */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr =
MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/column index arrays */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) { /* X's pattern is contained in Y's: Y = a*X + 1*Y via spgeam, writing in place into Y */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host here, so temporarily switch the handle's pointer mode */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr,    ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) { /* identical patterns: the value arrays align entry-by-entry, plain axpy on the values */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else { /* different patterns: host fallback (pattern of Y changes, so drop the cached transpose) */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y *= a: scale the GPU value array in place with cublas */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cudaError_t    cerr;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero all stored values, on both the device copies (mat and cached transpose) and the host array */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set when the GPU copy was zeroed too */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) { /* also zero the cached explicit transpose so it stays consistent */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); /* zero the host values directly */
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switch the matrix between the CPU (flg=PETSC_TRUE) and GPU implementations by
   swapping the ops table entries and the composed methods. No-op for factored matrices. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is up to date before handing control to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inode optimization only helps the CPU kernels */
  PetscFunctionReturn(0);
}

PETSC_INTERN
PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX falls through here with B == A */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: allocate the cusparse context and pick the default algorithms */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU ops/composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a host SeqAIJ, then convert it in place to SeqAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3388 3389 Level: beginner 3390 3391 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3392 M*/ 3393 3394 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3395 3396 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3397 { 3398 PetscErrorCode ierr; 3399 3400 PetscFunctionBegin; 3401 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr); 3402 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3403 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3404 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3405 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 3406 3407 PetscFunctionReturn(0); 3408 } 3409 3410 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3411 { 3412 PetscErrorCode ierr; 3413 cusparseStatus_t stat; 3414 3415 PetscFunctionBegin; 3416 if (*cusparsestruct) { 3417 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 3418 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 3419 delete (*cusparsestruct)->workVector; 3420 delete (*cusparsestruct)->rowoffsets_gpu; 3421 delete (*cusparsestruct)->cooPerm; 3422 delete (*cusparsestruct)->cooPerm_a; 3423 delete (*cusparsestruct)->csr2csc_i; 3424 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3425 ierr = 
PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix container and its thrust arrays (values, column indices, row offsets) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a triangular-factor struct: cusparse descriptor, analysis info, CSR storage, and scratch buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free a mult struct (CSR or HYB payload, descriptors, scalar constants, and per-vector SpMV buffers) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* fix: do not ignore the error code, consistent with the trifactor overload above */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release the triangular factors and permutation/work arrays, keeping the container itself alive */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices =
NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
  }
  PetscFunctionReturn(0);
}

/* Fully destroy the triangular-factor container: reset contents, destroy the cusparse handle, free the struct */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle; /* fix: assignment hoisted out of the if condition */
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row, column) ordering of COO index pairs */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of (row, column) pairs, used to detect duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 if equal, 1 otherwise; used with adjacent_difference to flag index changes */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ?
0 : 1;
  }
};

/* Logical OR of two flags, used to combine the row/column change indicators */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Insert/add the COO values v[] (host or device memory) into the GPU CSR storage,
   using the sorting permutation (cooPerm) and duplicate map (cooPerm_a) computed by
   MatSetPreallocationCOO_SeqAIJCUSPARSE(). A NULL v with INSERT_VALUES zeros the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix *matrix;
  PetscErrorCode ierr;
  cudaError_t cerr;
  PetscInt n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* values live on the host: stage them into a device buffer */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* repeated COO entries: reduce duplicates first, then add to the CSR values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Mark the cached explicit transpose as out of date; if destroy is true, free it as well */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Build the CSR structure on the GPU from n COO (i,j) pairs, recording the sorting
   permutation in cusp->cooPerm and (when duplicates exist) the reduce map in cusp->cooPerm_a */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt cooPerm_n, nzr = 0;
  cudaError_t cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ?
cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* COO size changed: discard the cached permutation arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort (i,j) pairs lexicographically, carrying the original positions in cooPerm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag positions where (i,j) changes, then prefix-sum to get, for every sorted COO
         entry, the index of the unique CSR slot it reduces into */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row: upper_bound over the sorted (unique) row indices */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays (a->i, a->j) from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a = PETSC_TRUE;
    a->free_ij = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Get a read-only device pointer to the CSR values; pair with MatSeqAIJCUSPARSERestoreArrayRead().
   NOTE(review): unlike MatSetValuesCOO above, cusp is dereferenced without a !cusp guard — verify spptr is always set here */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}

/* Restore the pointer obtained with MatSeqAIJCUSPARSEGetArrayRead(); read access, so no state bump */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Get a read-write device pointer to the CSR values; copies host data up first and
   invalidates the cached transpose since the values may change */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Restore the pointer obtained with MatSeqAIJCUSPARSEGetArray(); bumps the object state */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Get a write-only device pointer to the CSR values; no host-to-device copy is triggered */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix *csr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Restore the pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite(); bumps the object state */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}

/* Order (row, column, value, flag) tuples by row then column, for merging two CSR matrices */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Add a constant shift to an index (used to offset B's column indices by A's column count) */
struct Shift
{
  int _shift;

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};

/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix *Acsr,*Bcsr,*Ccsr;
  PetscInt Annz,Bnnz;
  cusparseStatus_t stat;
  PetscInt i,m,n,zero = 0;
  cudaError_t cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3924 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3925 if (reuse == MAT_INITIAL_MATRIX) { 3926 m = A->rmap->n; 3927 n = A->cmap->n + B->cmap->n; 3928 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3929 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3930 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3931 c = (Mat_SeqAIJ*)(*C)->data; 3932 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3933 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3934 Ccsr = new CsrMatrix; 3935 Cmat->cprowIndices = NULL; 3936 c->compressedrow.use = PETSC_FALSE; 3937 c->compressedrow.nrows = 0; 3938 c->compressedrow.i = NULL; 3939 c->compressedrow.rindex = NULL; 3940 Ccusp->workVector = NULL; 3941 Ccusp->nrows = m; 3942 Ccusp->mat = Cmat; 3943 Ccusp->mat->mat = Ccsr; 3944 Ccsr->num_rows = m; 3945 Ccsr->num_cols = n; 3946 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3947 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3948 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3949 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3950 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3951 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3952 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3953 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3954 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3955 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3956 ierr = 
MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3957 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 3958 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 3959 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3960 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3961 3962 Acsr = (CsrMatrix*)Acusp->mat->mat; 3963 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3964 Annz = (PetscInt)Acsr->column_indices->size(); 3965 Bnnz = (PetscInt)Bcsr->column_indices->size(); 3966 c->nz = Annz + Bnnz; 3967 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3968 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3969 Ccsr->values = new THRUSTARRAY(c->nz); 3970 Ccsr->num_entries = c->nz; 3971 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3972 if (c->nz) { 3973 auto Acoo = new THRUSTINTARRAY32(Annz); 3974 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 3975 auto Ccoo = new THRUSTINTARRAY32(c->nz); 3976 THRUSTINTARRAY32 *Aroff,*Broff; 3977 3978 if (a->compressedrow.use) { /* need full row offset */ 3979 if (!Acusp->rowoffsets_gpu) { 3980 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3981 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3982 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3983 } 3984 Aroff = Acusp->rowoffsets_gpu; 3985 } else Aroff = Acsr->row_offsets; 3986 if (b->compressedrow.use) { /* need full row offset */ 3987 if (!Bcusp->rowoffsets_gpu) { 3988 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3989 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3990 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3991 } 3992 Broff = Bcusp->rowoffsets_gpu; 3993 } else Broff = Bcsr->row_offsets; 3994 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3995 stat = cusparseXcsr2coo(Acusp->handle, 3996 Aroff->data().get(), 3997 Annz, 3998 m, 3999 Acoo->data().get(), 4000 
CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4001 stat = cusparseXcsr2coo(Bcusp->handle, 4002 Broff->data().get(), 4003 Bnnz, 4004 m, 4005 Bcoo->data().get(), 4006 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4007 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4008 auto Aperm = thrust::make_constant_iterator(1); 4009 auto Bperm = thrust::make_constant_iterator(0); 4010 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4011 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4012 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4013 #else 4014 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4015 auto Bcib = Bcsr->column_indices->begin(); 4016 auto Bcie = Bcsr->column_indices->end(); 4017 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4018 #endif 4019 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4020 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4021 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4022 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4023 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4024 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4025 auto p1 = Ccusp->cooPerm->begin(); 4026 auto p2 = Ccusp->cooPerm->begin(); 4027 thrust::advance(p2,Annz); 4028 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4029 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4030 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4031 #endif 4032 auto cci = thrust::make_counting_iterator(zero); 4033 auto cce = 
thrust::make_counting_iterator(c->nz); 4034 #if 0 //Errors on SUMMIT cuda 11.1.0 4035 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4036 #else 4037 auto pred = thrust::identity<int>(); 4038 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4039 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4040 #endif 4041 stat = cusparseXcoo2csr(Ccusp->handle, 4042 Ccoo->data().get(), 4043 c->nz, 4044 m, 4045 Ccsr->row_offsets->data().get(), 4046 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4047 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4048 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4049 delete wPerm; 4050 delete Acoo; 4051 delete Bcoo; 4052 delete Ccoo; 4053 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4054 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4055 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4056 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4057 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4058 #endif 4059 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4060 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4061 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4062 CsrMatrix *CcsrT = new CsrMatrix; 4063 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4064 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4065 4066 (*C)->form_explicit_transpose = PETSC_TRUE; 4067 (*C)->transupdated = PETSC_TRUE; 4068 Ccusp->rowoffsets_gpu = NULL; 4069 CmatT->cprowIndices = NULL; 4070 CmatT->mat = CcsrT; 4071 CcsrT->num_rows = n; 4072 CcsrT->num_cols = m; 4073 CcsrT->num_entries = c->nz; 4074 4075 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4076 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4077 CcsrT->values = new THRUSTARRAY(c->nz); 4078 4079 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4080 auto rT = CcsrT->row_offsets->begin(); 4081 if (AT) { 4082 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4083 thrust::advance(rT,-1); 4084 } 4085 if (BT) { 4086 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4087 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4088 thrust::copy(titb,tite,rT); 4089 } 4090 auto cT = CcsrT->column_indices->begin(); 4091 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4092 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4093 auto vT = CcsrT->values->begin(); 4094 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4095 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4096 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4097 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4098 4099 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4100 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4101 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4102 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4103 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4104 cerr = cudaMalloc((void **)&(CmatT->beta_one), 
sizeof(PetscScalar));CHKERRCUDA(cerr); 4105 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4106 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4107 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4108 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4109 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4110 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4111 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4112 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4113 #endif 4114 Ccusp->matTranspose = CmatT; 4115 } 4116 } 4117 4118 c->singlemalloc = PETSC_FALSE; 4119 c->free_a = PETSC_TRUE; 4120 c->free_ij = PETSC_TRUE; 4121 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4122 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4123 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4124 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4125 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4126 ii = *Ccsr->row_offsets; 4127 jj = *Ccsr->column_indices; 4128 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4129 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4130 } else { 4131 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4132 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4133 } 4134 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + 
Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4135 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4136 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4137 c->maxnz = c->nz; 4138 c->nonzerorowcnt = 0; 4139 c->rmax = 0; 4140 for (i = 0; i < m; i++) { 4141 const PetscInt nn = c->i[i+1] - c->i[i]; 4142 c->ilen[i] = c->imax[i] = nn; 4143 c->nonzerorowcnt += (PetscInt)!!nn; 4144 c->rmax = PetscMax(c->rmax,nn); 4145 } 4146 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4147 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4148 (*C)->nonzerostate++; 4149 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4150 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4151 Ccusp->nonzerostate = (*C)->nonzerostate; 4152 (*C)->preallocated = PETSC_TRUE; 4153 } else { 4154 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4155 c = (Mat_SeqAIJ*)(*C)->data; 4156 if (c->nz) { 4157 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4158 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4159 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4160 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4161 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4162 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4163 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4164 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4165 Acsr = (CsrMatrix*)Acusp->mat->mat; 4166 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4167 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4168 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4169 if (Bcsr->num_entries != 
(PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4170 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4171 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4172 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4173 auto pmid = Ccusp->cooPerm->begin(); 4174 thrust::advance(pmid,Acsr->num_entries); 4175 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4176 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4177 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4178 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4179 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4180 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4181 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4182 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4183 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4184 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4185 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4186 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4187 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4188 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4189 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto      vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gathers selected entries of the device nonzero array of A into v[].

   Input:
     A   - a MATSEQAIJCUSPARSE matrix
     n   - number of entries to copy
     idx - indices (into the aa array) of the entries to gather; if NULL the first n entries are copied verbatim
   Output:
     v   - destination array; may be host or device memory (detected with isCudaMem())
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n); /* moves the index list host -> device */
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a device scratch array, then copy down */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the copies above moved data device -> host: log as GpuToCpu
     (bug fix: this was previously mislogged as PetscLogCpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)

  requires:
     structurally symmetric: fix with transpose/column meta data
*/

/*
  The GPU LU factor kernel
*/
/* Fills bi_csr[0..n] with the row offsets of the banded storage.
   Grid layout: blockIdx.x = field (Nf fields), blockIdx.y = sub-block of the field's rows;
   threadIdx.y strides over rows, only threadIdx.x == 0 writes.
   The closed-form expression appears to account for the clipped "ears" at the first and
   last bw rows of the band — NOTE(review): formula not re-derived here, confirm against
   nzBcsr = n + (2n-1)bw - bw^2 used in the symbolic factorization. */
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // set i (row+1)
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
  // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i && threadIdx.x==0) {
      PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ?
mi*(mi-1)/2 + mi: 0;
      bi_csr[rowb+1] = n1L + nug - clip + n2L + i; /* prefix sum of band-row lengths in closed form */
    }
  }
}
// copy AIJ to AIJ_BAND
/* Copies the permuted CSR matrix A (ai_d/aj_d/aa_d, device) into the dense-band store ba_csr,
   zeroing the band first. r[] is the row permutation, ic[] the inverse column permutation,
   and bi_csr[] the band row offsets produced by mat_lu_factor_band_init_set_i.
   Grid layout (set by the caller): blockIdx.x = field, blockIdx.y = sub-block of rows;
   threadIdx.y strides over rows, threadIdx.x over entries within a row. */
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
                                const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
                                const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      PetscScalar    *batmp = ba_csr + bi_csr[rowb];
      const PetscInt nzb    = bi_csr[rowb+1] - bi_csr[rowb];
      for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
        if (j<nzb) {
          batmp[j] = 0;
        }
      }
    }
  }

  // copy A into B with CSR format -- these two loops can be fused
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
      const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0; /* bjStart = first (permuted) column of this band row */
      const PetscScalar *av    = aa_d + ai_d[rowa];
      PetscScalar       *batmp = ba_csr + bi_csr[rowb];
      /* load in initial (unfactored row) */
      for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
        if (j<nza) {
          PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
          PetscScalar vala = av[j];
          batmp[idx] = vala;
        }
      }
    }
  }
}
// print AIJ_BAND
/* Debug-only helper: one thread prints the entire banded matrix with printf. */
__global__
void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
    printf("B (AIJ) n=%d:\n",(int)n);
    for (int rowb=0;rowb<n;rowb++) {
      const PetscInt    nz     = bi_csr[rowb+1] - bi_csr[rowb];
      const PetscScalar *batmp = ba_csr + bi_csr[rowb];
      for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
      printf(" bi=%d\n",bi_csr[rowb+1]);
    }
  }
}
// Band LU kernel --- ba_csr bi_csr
/* In-place LU factorization (no pivoting) of the banded matrix in ba_csr/bi_csr.
   Dynamic shared memory: one PetscInt per threadIdx.y row (sm_pkIdx).
   With CUDA >= 11 this kernel is launched via cudaLaunchCooperativeKernel so that
   cooperative_groups::this_grid().sync() can order the panel updates across blocks;
   the pre-11 path uses __syncthreads() (the caller then launches a single y-block). */
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
{
  extern __shared__ PetscInt smemInt[];
  PetscInt       *sm_pkIdx = &smemInt[0];
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt start = field*nloc, end = start + nloc;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  auto g = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    PetscInt       tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the inter ears
    const PetscInt nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ?
bw : glbDD; // global to go past ears after first
    const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y)); /* pad so all threads run the same trip count */
    PetscScalar       *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
    const PetscScalar *baUd = pBdd + 1; // vector of data U(i,i+1:end)
    const PetscScalar Bdd   = *pBdd;
    const PetscInt    offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
    for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
      if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
        const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
        PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
        *Aid = *Aid/Bdd; /* L entry */
        sm_pkIdx[threadIdx.y] = kIdx; /* publish the column offset to the row's other x-threads */
      }
      __syncthreads(); // synch on threadIdx.x only
      if (idx < nzUd) { /* assuming symmetric structure */
        PetscInt    kIdx = sm_pkIdx[threadIdx.y];
        PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
        PetscScalar *Aij = Aid + 1;
        PetscScalar Lid  = *Aid;
        for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) {
          if (jIdx<nzUd) {
            Aij[jIdx] -= Lid*baUd[jIdx]; /* rank-1 trailing update */
          }
        }
      }
    }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    g.sync(); /* all blocks must see the finished panel before the next pivot row */
#else
    __syncthreads();
#endif
  } /* endof for (i=0; i<n; i++) { */
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);

/* Numeric phase of the banded LU factorization (MATSOLVERCUSPARSEBAND).
   B is the factor matrix prepared by MatLUFactorSymbolic_SeqAIJCUSPARSEBAND; A is the
   matrix to factor. The permuted A is copied into the GPU band store and factored with
   mat_lu_factor_band (launched cooperatively on CUDA >= 11 so it can grid-sync).
   The bandwidth bw is recovered from nz = n + (2n-1)bw - bw^2. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructA;
  CsrMatrix                    *matrixA;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  const PetscInt               n=A->rmap->n, *ic, *r;
  const int                    *ai_d, *aj_d;
  const PetscScalar            *aa_d;
  PetscScalar                  *ba_t = cusparseTriFactors->a_band_d;
  int                          *bi_t = cusparseTriFactors->i_band_d;
  PetscContainer               container;
  int                          Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1; /* defaults overwritten below */

  PetscFunctionBegin;
  if (A->rmap->n == 0) {
    PetscFunctionReturn(0);
  }
  // cusparse setup
  if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
  matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; // matstruct->cprowIndices
  if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matrixA = (CsrMatrix*)matstructA->mat;
  if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");

  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
    if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  // get data
  ic   = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
  ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d = thrust::raw_pointer_cast(matrixA->values->data()); /* was data().get() inside raw_pointer_cast; normalized to match the other casts */
  r    = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  {
    /* invert nz = n + (2n-1)bw - bw^2 (see the symbolic factorization) for the bandwidth */
    int            bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
    int            gpuid;
    cudaDeviceProp prop;

    cerr = cudaGetDevice(&gpuid);CHKERRCUDA(cerr);                 /* was unchecked */
    cerr = cudaGetDeviceProperties(&prop, gpuid);CHKERRCUDA(cerr); /* was unchecked */
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
    Ni = 1; /* no cooperative launch before CUDA 11: a single y-block so __syncthreads() suffices
               (removed dead store "Ni = 1/nconcurrent;" that was immediately overwritten) */
#else
    nsm = prop.multiProcessorCount;
    Ni  = nsm/Nf/nconcurrent;
#endif
    team_size = bw/Ni + !!(bw%Ni);
    nVec      = PetscMin(bw, 1024/team_size);
    ierr = PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);CHKERRQ(ierr);
    {
      dim3 dimBlockTeam(nVec,team_size);
      dim3 dimBlockLeague(Nf,Ni);
      mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
      CHECK_LAUNCH_ERROR(); // does a sync
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t};
      cerr = cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);CHKERRCUDA(cerr); /* was unchecked */
#else
      mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
#endif
      CHECK_LAUNCH_ERROR(); // does a sync
#if defined(PETSC_USE_LOG)
      ierr = PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
  B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
  B->ops->solvetranspose = NULL; // need transpose
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  PetscFunctionReturn(0);
}

/* PetscContainer destroy callback that frees the "Nf" (number of fields) payload */
static PetscErrorCode MatrixNfDestroy(void *ptr)
{
  PetscInt       *nf = (PetscInt *)ptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = PetscFree(nf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic phase of the banded LU factorization (MATSOLVERCUSPARSEBAND).
   Computes the bandwidth of A under the row/column orderings isrow/iscol, allocates the
   dense band storage on the GPU (bi/ba arrays in the tri-factor struct), and caches the
   row and inverse-column permutations for the numeric phase and the solve.
   Requirements enforced below: A square, no missing diagonal entries, structurally
   symmetric, equal lower/upper bandwidths, and n divisible by the number of fields Nf
   (taken from an "Nf" PetscContainer composed on A, default 1; it is re-composed on B). */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data,*b;
  IS                           isicol;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  const PetscInt               *ic,*ai=a->i,*aj=a->j;
  PetscScalar                  *ba_t;
  int                          *bi_t;
  PetscInt                     i,n=A->rmap->n,Nf;
  PetscInt                     nzBcsr,bwL,bwU;
  PetscBool                    missing;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscContainer               container;

  PetscFunctionBegin;
  if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
  ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
  if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
  ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);CHKERRQ(ierr);
  if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structrally symmetric matrices supported");

  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
    /* propagate "Nf" onto the factor B with a freshly allocated payload (B owns its copy) */
    ierr = PetscContainerCreate(PETSC_COMM_SELF, &container);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(PetscInt), &pNf);CHKERRQ(ierr);
    *pNf = Nf;
    ierr = PetscContainerSetPointer(container, (void *)pNf);CHKERRQ(ierr);
    ierr = PetscContainerSetUserDestroy(container, MatrixNfDestroy);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);CHKERRQ(ierr);
    ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);CHKERRQ(ierr);
  b    = (Mat_SeqAIJ*)(B)->data;

  /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
  bwL = bwU = 0;
  for (int rwb=0; rwb<n; rwb++) {
    const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
    for (int j=0;j<anz;j++) {
      PetscInt colb = ic[ajtmp[j]];
      if (colb<rwa) { // L
        if (rwa-colb > bwL) bwL = rwa-colb;
      } else {
        if (colb-rwa > bwU) bwU = colb-rwa;
      }
    }
  }
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  /* only support structurally symmetric, but it might work */
  if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  nzBcsr = n + (2*n-1)*bwU - bwU*bwU; /* full band minus the clipped corners */
  b->maxnz = b->nz = nzBcsr;
  cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
  cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
  cusparseTriFactors->a_band_d = ba_t;
  cusparseTriFactors->i_band_d = bi_t;
  /* In b structure: Free imax, ilen, old a, old j. Allocate solve_work, new a, new j */
  ierr = PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));CHKERRQ(ierr);
  {
    dim3 dimBlockTeam(1,128);
    dim3 dimBlockLeague(Nf,1);
    mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
  }
  CHECK_LAUNCH_ERROR(); // does a sync

  // setup data
  if (!cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  /* upper triangular indices */
  if (!cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(isicol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(isicol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* put together the new matrix */
  b->free_a       = PETSC_FALSE;
  b->free_ij      = PETSC_FALSE;
  b->singlemalloc = PETSC_FALSE;
  b->ilen = NULL;
  b->imax = NULL;
  b->row  = isrow;
  b->col  = iscol;
  ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
  ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
  b->icol = isicol;
  ierr = PetscMalloc1(n+1,&b->solve_work);CHKERRQ(ierr);

  B->factortype            = MAT_FACTOR_LU;
  B->info.factor_mallocs   = 0;
  B->info.fill_ratio_given = 0;

  if (ai[n]) {
    B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    ierr = PetscInfo1(A,"Band fill ratio %g\n",(double)af);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
  }
#endif
  if (a->inode.size) {
    ierr = PetscInfo(A,"Warning: using inodes in band solver.\n");CHKERRQ(ierr);
  }
  ierr = MatSeqAIJCheckInode_FactorLU(B);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
  B->offloadmask = PETSC_OFFLOAD_GPU;

  PetscFunctionReturn(0);
}

/* Use -pc_factor_mat_solver_type cusparseband */
/* Reports the MatSolverType name for this factorization */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSEBAND;
  PetscFunctionReturn(0);
}

/* Factory routine: creates the (unallocated) factor matrix B of type MATSEQAIJCUSPARSE
   for the cusparseband solver; only MAT_FACTOR_LU is supported */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype     = ftype;
  (*B)->canuseordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);CHKERRQ(ierr);
PetscFunctionReturn(0);
}

#define WARP_SIZE 32
/* Warp-level sum reduction: returns, in lane 0, the sum of `a` over all 32 lanes.
   Assumes the full warp participates (mask 0xffffffff). */
template <typename T>
__forceinline__ __device__
T wreduce(T a)
{
  T b;
#pragma unroll
  for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
    b  = __shfl_down_sync(0xffffffff, a, i);
    a += b;
  }
  return a;
}
// reduce in a block, returns result in thread 0
/* Block-level sum reduction over BLOCK_SIZE threads (BLOCK_SIZE must be a multiple of
   WARP_SIZE). Must be called by every thread of the block (contains __syncthreads()). */
template <typename T, int BLOCK_SIZE>
__device__
T breduce(T a)
{
  constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
  __shared__ T  buf[NWARP]; /* bug fix: was hard-coded double; use T so non-double instantiations reduce in their own type */
  int wid    = threadIdx.x / WARP_SIZE;
  int laneid = threadIdx.x % WARP_SIZE;
  T   b      = wreduce<T>(a);
  if (laneid == 0)
    buf[wid] = b;
  __syncthreads();
  if (wid == 0) {
    if (threadIdx.x < NWARP)
      a = buf[threadIdx.x];
    else
      a = 0;
    for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
      a += __shfl_down_sync(0xffffffff, a, i);
    }
  }
  /* bug fix: trailing barrier so a back-to-back breduce() call (the complex path of
     mat_solve_band calls it twice in one expression) cannot overwrite buf while warp 0
     is still reading it */
  __syncthreads();
  return a;
}


// Band LU kernel --- ba_csr bi_csr
/* Banded forward/backward substitution: applies L^{-1} then U^{-1} to x in place.
   One field (diagonal block) per CUDA block, launched as mat_solve_band<block><<<Nf,block>>>;
   the BLOCK_SIZE x-threads cooperate on each row's dot product, reduced with breduce.
   pLi walks the band storage; the offset arithmetic mirrors the layout built by
   mat_lu_factor_band_init_set_i (clipped "ears" of size bw*(bw+1)/2 per field). */
template <int BLOCK_SIZE>
__global__
void __launch_bounds__(256,1)
mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt    Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
  const PetscScalar *pLi;
  const int         tid = threadIdx.x;

  /* Next, solve L */
  pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    const PetscInt col = locDD<bw ? start : (glbDD-bw); /* first column of L in this row */
    PetscScalar    t = 0;
    for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
      t += pLi[idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal   tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(t);
#endif
    if (threadIdx.x == 0)
      x[glbDD] -= t; // /1.0 (L has unit diagonal: the factor kernel stores L scaled by 1/Bdd)
    __syncthreads();
    // inc
    pLi += glbDD-col; // get to diagonal
    if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
    else                pLi += bw;
    pLi += 1; // skip to next row
    if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
  }
  /* Then, solve U */
  pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
  if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
  for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
    const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
    PetscScalar    t = 0;
    for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
      t += pLi[-idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal   tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
#endif
    pLi -= col-glbDD; // diagonal
    if (threadIdx.x == 0) {
      x[glbDD] -= t;
      x[glbDD] /= pLi[0]; /* U diagonal is stored explicitly */
    }
    __syncthreads();
    // inc past L to start of previous U
    pLi -= bw+1;
    if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
    if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
  }
}

/* MatSolve for MATSOLVERCUSPARSEBAND: x = Q U^{-1} L^{-1} P b, with the row permutation P
   and column permutation Q applied via thrust permutation-iterator copies around the band
   solve kernel. The bandwidth bw is recovered from nz = n + (2n-1)bw - bw^2. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscContainer                        container;

  PetscFunctionBegin;
  if (A->rmap->n == 0) {
    PetscFunctionReturn(0);
  }
  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());
  constexpr int block = 128;
  mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
  CHECK_LAUNCH_ERROR(); // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}