1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 #include <thrust/async/for_each.h> 17 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 18 #include <cooperative_groups.h> 19 #endif 20 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 21 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 22 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 23 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
24 25 typedef enum { 26 CUSPARSE_MV_ALG_DEFAULT = 0, 27 CUSPARSE_COOMV_ALG = 1, 28 CUSPARSE_CSRMV_ALG1 = 2, 29 CUSPARSE_CSRMV_ALG2 = 3 30 } cusparseSpMVAlg_t; 31 32 typedef enum { 33 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 34 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 35 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 36 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 37 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 38 CUSPARSE_SPMM_ALG_DEFAULT = 0, 39 CUSPARSE_SPMM_COO_ALG1 = 1, 40 CUSPARSE_SPMM_COO_ALG2 = 2, 41 CUSPARSE_SPMM_COO_ALG3 = 3, 42 CUSPARSE_SPMM_COO_ALG4 = 5, 43 CUSPARSE_SPMM_CSR_ALG1 = 4, 44 CUSPARSE_SPMM_CSR_ALG2 = 6, 45 } cusparseSpMMAlg_t; 46 47 typedef enum { 48 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 49 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 50 } cusparseCsr2CscAlg_t; 51 */ 52 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 53 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 54 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! 
We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 55 #endif 56 57 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 58 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 59 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 60 61 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*); 62 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*); 63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66 67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 81 82 static PetscErrorCode 
CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/* Record the (caller-owned) CUDA stream in A's Mat_SeqAIJCUSPARSE context and direct
   the matrix' cuSPARSE handle to issue its work on that stream.
   Errors with PETSC_ERR_COR if A carries no GPU context (spptr). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Install a cuSPARSE handle in A's GPU context. If a different handle is already
   stored it is destroyed first, so the context references at most one handle.
   The handle's pointer mode is always (re)set to CUSPARSE_POINTER_MODE_DEVICE. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the previously stored handle before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Forget (without destroying) the cuSPARSE handle referenced by A's GPU context.
   Quietly a no-op when A is not MATSEQAIJCUSPARSE or has no context; presumably
   the handle is owned and cleaned up by the caller that installed it -- the code
   here only clears the reference. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* "MatFactorGetSolverType_C" callback: factors produced by this file report
   MATSOLVERCUSPARSE as their solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory for MATSOLVERCUSPARSE factors: creates an empty square MATSEQAIJCUSPARSE
   matrix B sized like A and installs the symbolic-factorization methods for the
   requested factor type. LU/ILU/ILUDT and Cholesky/ICC are supported; any other
   factor type raises PETSC_ERR_SUP. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: no CSR storage is allocated here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific back end for MatCUSPARSESetFormat(). For sequential matrices both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same (single) format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch via the composed "MatCUSPARSESetFormat_C" method; a no-op for matrix
     types that do not compose it */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatSetOption override: handles MAT_FORM_EXPLICIT_TRANSPOSE itself (invalidating
   any stored GPU transpose when the option is turned off) and defers every other
   option to the host MatSetOption_SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization. The factorization itself runs on the host via
   MatLUFactorNumeric_SeqAIJ; afterwards the MatSolve implementations are selected
   (natural-ordering fast path when both permutations are the identity) and the
   triangular factors are analyzed and copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before factoring on the CPU */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process the -mat_cusparse_* runtime options: storage format for SpMV/TriSolve
   and, for CUDA >= 11, the cuSPARSE SpMV/SpMM/csr2csc algorithm selections.
   Only applies to non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: drop any previous GPU triangular-factor data, run the host
   symbolic phase, and point the numeric phase at the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: reset GPU factor data, run host symbolic phase, install the
   CUSPARSE numeric routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reset GPU factor data, run host symbolic phase, install the
   CUSPARSE Cholesky numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: reset GPU factor data, run host symbolic phase, install the
   CUSPARSE Cholesky numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr =
MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build (first call) or refresh (subsequent calls) the GPU copy of the unit
   lower triangular ILU factor from the host factor stored in A. On the first
   call the CSR structure is assembled in pinned host buffers (explicit unit
   diagonal appended to each row), uploaded into a new CsrMatrix, and the
   cuSPARSE triangular-solve analysis is performed; later calls only refresh
   the numerical values through the retained AA_h staging buffer. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host staging buffers (cudaMallocHost) for values and structure */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;   /* append the unit diagonal entry */
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the device buffer required by the csrsv2 analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo; /* keep the pinned value buffer for later value-only updates */
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build (first call) or refresh the GPU copy of the upper triangular ILU factor.
   Rows are assembled from last to first using a->diag offsets into the packed
   host factor; the diagonal entry of each row is stored as its reciprocal
   (1./v[nz]). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next,
figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host staging buffers for the CSR structure and values */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking the rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz]; /* diagonal stored as its reciprocal */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the device buffer required by the csrsv2 analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp; /* keep the pinned value buffer for later value-only updates */
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* structure already on the GPU: refresh only the numerical values */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Driver for the ILU GPU setup: builds/refreshes both triangular factors on the
   device, allocates the work vector used between the two triangular solves, and
   caches the row/column permutations on the GPU when they are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if
(!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: cache the row permutation on the device once */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: cache the column permutation on the device once */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Build the upper and lower triangular ICC factors on the GPU from the host
   factor. NOTE(review): 'a' (Mat_SeqAIJ) and 'b' (Mat_SeqSBAIJ) below alias the
   same A->data pointer; the structure arrays are read through the SBAIJ view --
   confirm the factor's actual storage layout before modifying this routine. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n)
PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt) 0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; both factors store 1/diag on the diagonal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals are negated for U; L additionally scales by 1/diag */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        /* the "lower" factor reuses the upper storage and solves the transpose,
           hence UPPER fill mode with a TRANSPOSE solve op */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                                 );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist: refresh numerical values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC triangular factors on the GPU and cache the (inverse)
   permutation of the factorization as device index arrays. The nnz count
   reflects that the symmetric factor stores each off-diagonal once but is
   used for both triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload row permutation and its inverse when non-identity */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Check both synchronous (pre-launch) and asynchronous (kernel execution)
   CUDA errors after a kernel launch; the cudaDeviceSynchronize makes this
   macro expensive, so it is intended for correctness checks, not hot paths. */
#define CHECK_LAUNCH_ERROR()                                                             \
do {                                                                                     \
  /* Check synchronous errors, i.e. pre-launch */                                        \
  cudaError_t err = cudaGetLastError();                                                  \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
  /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
  err = cudaDeviceSynchronize();                                                         \
  if (cudaSuccess != err) {                                                              \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
  }                                                                                      \
} while (0)

/* Numeric Cholesky factorization: factor on the CPU with the SeqAIJ kernel,
   then install the GPU solve routines and push the triangular factors to the
   device. The permutation decides whether the natural-ordering fast path can
   be used by MatSolve. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Explicitly transpose (csr2csc) both triangular factors and run the
   cuSPARSE solve analysis on the transposed factors, caching them in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose for MatSolveTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this previously called PetscLogEventBegin again, leaving the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this previously called PetscLogEventBegin again, leaving the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor: truncate the real part of a PetscScalar to a PetscInt.
   Used to recover the csr2csc permutation from a csr2csc pass over 0,1,2,... */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/* Form (or refresh) the explicit transpose of the CSR matrix on the GPU for
   MatMultTranspose-type operations. The csr2csc permutation is cached in
   csr2csc_i so later value updates are a cheap gather instead of a full
   csr2csc conversion. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
1216 PetscErrorCode ierr; 1217 1218 PetscFunctionBegin; 1219 if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 1220 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1221 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1222 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct"); 1223 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1224 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct"); 1225 if (A->transupdated) PetscFunctionReturn(0); 1226 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1227 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1228 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1229 } 1230 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1231 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1232 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1233 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1234 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1235 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1236 1237 /* set alpha and beta */ 1238 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1239 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1240 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1241 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1242 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1243 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1244 1245 if 
(cusparsestruct->format == MAT_CUSPARSE_CSR) { 1246 CsrMatrix *matrixT = new CsrMatrix; 1247 matstructT->mat = matrixT; 1248 matrixT->num_rows = A->cmap->n; 1249 matrixT->num_cols = A->rmap->n; 1250 matrixT->num_entries = a->nz; 1251 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1252 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1253 matrixT->values = new THRUSTARRAY(a->nz); 1254 1255 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1256 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1257 1258 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1259 stat = cusparseCreateCsr(&matstructT->matDescr, 1260 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1261 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1262 matrixT->values->data().get(), 1263 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1264 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1265 #endif 1266 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1267 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1268 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1269 #else 1270 CsrMatrix *temp = new CsrMatrix; 1271 CsrMatrix *tempT = new CsrMatrix; 1272 /* First convert HYB to CSR */ 1273 temp->num_rows = A->rmap->n; 1274 temp->num_cols = A->cmap->n; 1275 temp->num_entries = a->nz; 1276 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1277 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1278 temp->values = new THRUSTARRAY(a->nz); 1279 1280 stat = cusparse_hyb2csr(cusparsestruct->handle, 1281 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1282 temp->values->data().get(), 1283 temp->row_offsets->data().get(), 1284 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1285 1286 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1287 tempT->num_rows = A->rmap->n; 1288 tempT->num_cols = A->cmap->n; 1289 tempT->num_entries = a->nz; 1290 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1291 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1292 tempT->values = new THRUSTARRAY(a->nz); 1293 1294 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1295 temp->num_cols, temp->num_entries, 1296 temp->values->data().get(), 1297 temp->row_offsets->data().get(), 1298 temp->column_indices->data().get(), 1299 tempT->values->data().get(), 1300 tempT->column_indices->data().get(), 1301 tempT->row_offsets->data().get(), 1302 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1303 1304 /* Last, convert CSC to HYB */ 1305 cusparseHybMat_t hybMat; 1306 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1307 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1308 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1309 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1310 matstructT->descr, tempT->values->data().get(), 1311 tempT->row_offsets->data().get(), 1312 tempT->column_indices->data().get(), 1313 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1314 1315 /* assign the pointer */ 1316 matstructT->mat = hybMat; 1317 A->transupdated = PETSC_TRUE; 1318 /* delete temporaries */ 1319 if (tempT) { 1320 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1321 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1322 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1323 delete (CsrMatrix*) tempT; 1324 } 1325 if (temp) { 1326 if (temp->values) delete (THRUSTARRAY*) temp->values; 1327 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1328 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1329 delete (CsrMatrix*) temp; 1330 } 1331 #endif 1332 } 1333 } 1334 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */ 1335 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1336 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1337 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix"); 1338 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows"); 1339 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols"); 1340 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values"); 1341 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT"); 1342 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows"); 1343 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols"); 1344 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values"); 1345 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1346 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1347 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1348 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1349 } 1350 if (!cusparsestruct->csr2csc_i) { 1351 THRUSTARRAY csr2csc_a(matrix->num_entries); 1352 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1353 1354 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1355 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1356 void *csr2cscBuffer; 1357 size_t csr2cscBufferSize; 1358 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1359 A->cmap->n, matrix->num_entries, 1360 matrix->values->data().get(), 1361 cusparsestruct->rowoffsets_gpu->data().get(), 1362 matrix->column_indices->data().get(), 1363 matrixT->values->data().get(), 1364 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 
cusparse_scalartype, 1365 CUSPARSE_ACTION_NUMERIC,indexBase, 1366 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1367 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1368 #endif 1369 1370 if (matrix->num_entries) { 1371 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1372 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1373 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1374 1375 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1376 should be filled with indexBase. So I just take a shortcut here. 1377 */ 1378 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1379 A->cmap->n,matrix->num_entries, 1380 csr2csc_a.data().get(), 1381 cusparsestruct->rowoffsets_gpu->data().get(), 1382 matrix->column_indices->data().get(), 1383 matrixT->values->data().get(), 1384 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1385 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1386 CUSPARSE_ACTION_NUMERIC,indexBase, 1387 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1388 #else 1389 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1390 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1391 #endif 1392 } else { 1393 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1394 } 1395 1396 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1397 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1398 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1399 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1400 #endif 1401 } 1402 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), 
                                                       cusparsestruct->csr2csc_i->begin()),
                                       thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                       matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Triangular solve with the transposed factors: x = A^{-T} b, using the
   row/column permutations stored in the factor structure. The transposed
   factor data is built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly: only done once,
     on the first transpose solve after (re)factorization */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: for the transposed system the upper factor is applied before the lower one */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transpose solve for factors computed in natural ordering: no row/column
   permutation passes are needed, only the two triangular solves. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: reads b, writes the intermediate into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: reads the work vector, writes the final solution into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward triangular solve x = A^{-1} b with permuted factors:
   permute b, solve L then U, then apply the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: scatter the work vector into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward solve for factors computed in natural ordering:
   no permutation passes, just L then U triangular solves. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr =
WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Mirror the GPU-resident CSR values back into the host Mat_SeqAIJ value array.
   Only the values are copied (the sparsity pattern is already on the host).
   A no-op unless the matrix is currently GPU-only; afterwards both copies are valid. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Return the host value array for write access; syncs values from the GPU
   first, then marks the CPU copy as the authoritative one. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  /* caller may modify the values, so the GPU copy becomes stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Build (or refresh) the cuSPARSE representation of the matrix from the host
   CSR data. If the nonzero pattern is unchanged and the format is CSR, only
   the values are re-uploaded; otherwise the whole GPU structure is rebuilt. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED ||
      A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: the cached transpose values are stale (pattern kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* pattern changed (or non-CSR format): destroy and rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* no host values yet: upload only the pattern; the CPU copy stays authoritative */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR copy, convert it to HYB, then discard the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* 'both' is false only when the host had no values to upload */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate element 0 of the tuple into element 1 */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: assign element 0 of the tuple to element 1 */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: assign element 1 of the tuple to element 0 (reverse direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product workspace attached to C->product->data for sparse-times-dense
   (and related) products; freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense;   /* C was MATSEQDENSE (CPU) on entry; convert back after the GPU product */
  PetscScalar           *Bt;        /* buffer for B^T (pre-CUDA-11 csrmm has no transpose-B support) */
  Mat                   X;          /* intermediate dense result for PtAP/RARt */
  PetscBool             reusesym;   /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for the MatMatCusparse workspace: releases device buffers,
   cuSPARSE descriptors, and the intermediate matrix X. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1966 ierr = PetscFree(data);CHKERRQ(ierr); 1967 PetscFunctionReturn(0); 1968 } 1969 1970 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1971 1972 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1973 { 1974 Mat_Product *product = C->product; 1975 Mat A,B; 1976 PetscInt m,n,blda,clda; 1977 PetscBool flg,biscuda; 1978 Mat_SeqAIJCUSPARSE *cusp; 1979 cusparseStatus_t stat; 1980 cusparseOperation_t opA; 1981 const PetscScalar *barray; 1982 PetscScalar *carray; 1983 PetscErrorCode ierr; 1984 MatMatCusparse *mmdata; 1985 Mat_SeqAIJCUSPARSEMultStruct *mat; 1986 CsrMatrix *csrmat; 1987 cudaError_t cerr; 1988 1989 PetscFunctionBegin; 1990 MatCheckProduct(C,1); 1991 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1992 mmdata = (MatMatCusparse*)product->data; 1993 A = product->A; 1994 B = product->B; 1995 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1996 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1997 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1998 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1999 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2000 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2001 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2002 switch (product->type) { 2003 case MATPRODUCT_AB: 2004 case MATPRODUCT_PtAP: 2005 mat = cusp->mat; 2006 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2007 m = A->rmap->n; 2008 n = B->cmap->n; 2009 break; 2010 case MATPRODUCT_AtB: 2011 if (!A->form_explicit_transpose) { 2012 mat = cusp->mat; 2013 opA = CUSPARSE_OPERATION_TRANSPOSE; 2014 } else { 2015 ierr = 
MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2016 mat = cusp->matTranspose; 2017 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2018 } 2019 m = A->cmap->n; 2020 n = B->cmap->n; 2021 break; 2022 case MATPRODUCT_ABt: 2023 case MATPRODUCT_RARt: 2024 mat = cusp->mat; 2025 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2026 m = A->rmap->n; 2027 n = B->rmap->n; 2028 break; 2029 default: 2030 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2031 } 2032 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2033 csrmat = (CsrMatrix*)mat->mat; 2034 /* if the user passed a CPU matrix, copy the data to the GPU */ 2035 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 2036 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 2037 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 2038 2039 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 2040 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2041 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2042 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 2043 } else { 2044 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2045 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2046 } 2047 2048 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2049 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2050 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2051 /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2052 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2053 size_t mmBufferSize; 2054 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2055 if (!mmdata->matBDescr) { 2056 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2057 mmdata->Blda = blda; 2058 } 2059 2060 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2061 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2062 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2063 mmdata->Clda = clda; 2064 } 2065 2066 if (!mat->matDescr) { 2067 stat = cusparseCreateCsr(&mat->matDescr, 2068 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2069 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2070 csrmat->values->data().get(), 2071 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2072 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2073 } 2074 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2075 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2076 mmdata->matCDescr,cusparse_scalartype, 2077 cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2078 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2079 cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2080 cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2081 mmdata->mmBufferSize = mmBufferSize; 2082 } 2083 mmdata->initialized = PETSC_TRUE; 2084 } else { 2085 /* 
to be safe, always update pointers of the mats */ 2086 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2087 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2088 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2089 } 2090 2091 /* do cusparseSpMM, which supports transpose on B */ 2092 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2093 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2094 mmdata->matCDescr,cusparse_scalartype, 2095 cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2096 #else 2097 PetscInt k; 2098 /* cusparseXcsrmm does not support transpose on B */ 2099 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2100 cublasHandle_t cublasv2handle; 2101 cublasStatus_t cerr; 2102 2103 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2104 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2105 B->cmap->n,B->rmap->n, 2106 &PETSC_CUSPARSE_ONE ,barray,blda, 2107 &PETSC_CUSPARSE_ZERO,barray,blda, 2108 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2109 blda = B->cmap->n; 2110 k = B->cmap->n; 2111 } else { 2112 k = B->rmap->n; 2113 } 2114 2115 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2116 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2117 csrmat->num_entries,mat->alpha_one,mat->descr, 2118 csrmat->values->data().get(), 2119 csrmat->row_offsets->data().get(), 2120 csrmat->column_indices->data().get(), 2121 mmdata->Bt ? 
/* NOTE(review, documentation only — code below is unchanged): this span contains three pieces.
   (1) The tail of MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA (opened earlier in the file):
       after the SpMM/csrmm call it waits for the GPU, logs 2*nnz(A)*n flops, restores the dense
       arrays, and for RARt (resp. PtAP) finishes with X*B^T (resp. X^T... — the transpose flag is
       the PETSC_TRUE argument) via MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private; if the
       caller's C (flagged in mmdata->cisdense) or B (flag biscuda) was a CPU dense matrix it is
       converted back to MATSEQDENSE in place.
   (2) MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA: symbolic phase for products of a
       MATSEQAIJCUSPARSE matrix A (CSR storage only) with a dense B. Computes the result sizes per
       product type (AB, AtB, ABt, PtAP, RARt), forces C to MATSEQDENSECUDA while remembering in
       mmdata->cisdense whether the caller asked for CPU MATSEQDENSE, allocates mmdata->Bt for B^T
       when CUDA < 11 (cusparseXcsrmm cannot transpose B — see the in-code comment), creates the
       intermediate dense matrix mmdata->X for RARt/PtAP (sized A->rmap->n x B->rmap->n resp.
       A->rmap->n x B->cmap->n; RARt deliberately relies on lazy GPU preallocation), and installs
       the numeric callback.
   (3) The opening of MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE (continues below): declares
       locals and handles the mmdata->reusesym shortcut, where values were already computed during
       the symbolic phase (api_user path) and only the 'finalize' bookkeeping must run. */
mmdata->Bt : barray,blda,mat->beta_zero, 2122 carray,clda);CHKERRCUSPARSE(stat); 2123 #endif 2124 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2125 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2126 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2127 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2128 if (product->type == MATPRODUCT_RARt) { 2129 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2130 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2131 } else if (product->type == MATPRODUCT_PtAP) { 2132 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2133 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2134 } else { 2135 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2136 } 2137 if (mmdata->cisdense) { 2138 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2139 } 2140 if (!biscuda) { 2141 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2142 } 2143 PetscFunctionReturn(0); 2144 } 2145 2146 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2147 { 2148 Mat_Product *product = C->product; 2149 Mat A,B; 2150 PetscInt m,n; 2151 PetscBool cisdense,flg; 2152 PetscErrorCode ierr; 2153 MatMatCusparse *mmdata; 2154 Mat_SeqAIJCUSPARSE *cusp; 2155 2156 PetscFunctionBegin; 2157 MatCheckProduct(C,1); 2158 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2159 A = product->A; 2160 B = product->B; 2161 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2162 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2163 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2164 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for 
MAT_CUSPARSE_CSR format"); 2165 switch (product->type) { 2166 case MATPRODUCT_AB: 2167 m = A->rmap->n; 2168 n = B->cmap->n; 2169 break; 2170 case MATPRODUCT_AtB: 2171 m = A->cmap->n; 2172 n = B->cmap->n; 2173 break; 2174 case MATPRODUCT_ABt: 2175 m = A->rmap->n; 2176 n = B->rmap->n; 2177 break; 2178 case MATPRODUCT_PtAP: 2179 m = B->cmap->n; 2180 n = B->cmap->n; 2181 break; 2182 case MATPRODUCT_RARt: 2183 m = B->rmap->n; 2184 n = B->rmap->n; 2185 break; 2186 default: 2187 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2188 } 2189 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2190 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2191 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2192 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2193 2194 /* product data */ 2195 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2196 mmdata->cisdense = cisdense; 2197 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2198 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2199 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2200 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2201 } 2202 #endif 2203 /* for these products we need intermediate storage */ 2204 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2205 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2206 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2207 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2208 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2209 } else { 2210 ierr = 
MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2211 } 2212 } 2213 C->product->data = mmdata; 2214 C->product->destroy = MatDestroy_MatMatCusparse; 2215 2216 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2217 PetscFunctionReturn(0); 2218 } 2219 2220 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2221 { 2222 Mat_Product *product = C->product; 2223 Mat A,B; 2224 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2225 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2226 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2227 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2228 PetscBool flg; 2229 PetscErrorCode ierr; 2230 cusparseStatus_t stat; 2231 cudaError_t cerr; 2232 MatProductType ptype; 2233 MatMatCusparse *mmdata; 2234 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2235 cusparseSpMatDescr_t BmatSpDescr; 2236 #endif 2237 2238 PetscFunctionBegin; 2239 MatCheckProduct(C,1); 2240 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2241 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2242 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2243 mmdata = (MatMatCusparse*)C->product->data; 2244 A = product->A; 2245 B = product->B; 2246 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2247 mmdata->reusesym = PETSC_FALSE; 2248 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2249 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2250 Cmat = Ccusp->mat; 2251 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2252 Ccsr = (CsrMatrix*)Cmat->mat; 2253 if (!Ccsr) 
/* NOTE(review, documentation only — code below is unchanged): bulk of
   MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE, the numeric phase of the sparse-sparse product
   C = op(A)*op(B) with all operands MATSEQAIJCUSPARSE in CSR format. After the reusesym shortcut
   (handled just above) and an early exit when C has no nonzeros, it validates operand types and
   rejects matrices bound to the CPU between the symbolic and numeric phases, copies A and B to the
   GPU, and selects the mult structs: the explicit transpose struct for AtB/ABt, with a shortcut
   that downgrades AtB (resp. ABt) to AB when A (resp. B) is flagged symmetric. mmdata->Bcsr, when
   set by the symbolic phase, supplies full (non-compressed) row offsets for B. The product itself
   is cusparseSpGEMM_compute + cusparseSpGEMM_copy on CUDA >= 11 (reusing the descriptor and
   buffers created in the symbolic phase), or a single cusparse_csr_spgemm call otherwise.
   'finalize' is a shortened MatAssemblyEnd_SeqAIJ: it only emits PetscInfo logging and marks C
   assembled; this span ends mid-statement (C->assembled = C->was_assembled = ... continues on the
   next span). */
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2254 goto finalize; 2255 } 2256 if (!c->nz) goto finalize; 2257 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2258 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2259 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2260 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2261 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2262 if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2263 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2264 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2265 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2266 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2267 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2268 if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2269 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2270 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2271 2272 ptype = product->type; 2273 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2274 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2275 switch (ptype) { 2276 case MATPRODUCT_AB: 2277 Amat = Acusp->mat; 2278 Bmat = Bcusp->mat; 2279 break; 2280 case MATPRODUCT_AtB: 2281 Amat = Acusp->matTranspose; 2282 Bmat = Bcusp->mat; 2283 break; 2284 case MATPRODUCT_ABt: 2285 Amat = 
Acusp->mat; 2286 Bmat = Bcusp->matTranspose; 2287 break; 2288 default: 2289 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2290 } 2291 Cmat = Ccusp->mat; 2292 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2293 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2294 if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2295 Acsr = (CsrMatrix*)Amat->mat; 2296 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2297 Ccsr = (CsrMatrix*)Cmat->mat; 2298 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2299 if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2300 if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2301 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2302 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2303 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2304 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2305 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2306 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2307 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2308 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2309 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2310 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2311 #else 2312 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2313 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2314 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2315 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2316 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2317 #endif 2318 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2319 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2320 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2321 C->offloadmask = PETSC_OFFLOAD_GPU; 2322 finalize: 2323 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2324 ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2325 ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2326 ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2327 c->reallocs = 0; 2328 C->info.mallocs += 0; 2329 C->info.nz_unneeded = 0; 2330 C->assembled = C->was_assembled = 
PETSC_TRUE; 2331 C->num_ass++; 2332 PetscFunctionReturn(0); 2333 } 2334 2335 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2336 { 2337 Mat_Product *product = C->product; 2338 Mat A,B; 2339 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2340 Mat_SeqAIJ *a,*b,*c; 2341 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2342 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2343 PetscInt i,j,m,n,k; 2344 PetscBool flg; 2345 PetscErrorCode ierr; 2346 cusparseStatus_t stat; 2347 cudaError_t cerr; 2348 MatProductType ptype; 2349 MatMatCusparse *mmdata; 2350 PetscLogDouble flops; 2351 PetscBool biscompressed,ciscompressed; 2352 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2353 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2354 size_t bufSize2; 2355 cusparseSpMatDescr_t BmatSpDescr; 2356 #else 2357 int cnz; 2358 #endif 2359 2360 PetscFunctionBegin; 2361 MatCheckProduct(C,1); 2362 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2363 A = product->A; 2364 B = product->B; 2365 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2366 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2367 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2368 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2369 a = (Mat_SeqAIJ*)A->data; 2370 b = (Mat_SeqAIJ*)B->data; 2371 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2372 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2373 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2374 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2375 2376 /* product data */ 2377 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2378 C->product->data = mmdata; 2379 C->product->destroy = 
/* NOTE(review, documentation only — code below is unchanged): bulk of
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE, the symbolic phase of the sparse-sparse
   product. Selects operand mult structs per product type (forming explicit transposes for
   AtB/ABt, with the symmetric-matrix downgrade to AB), then builds the
   Mat_SeqAIJCUSPARSEMultStruct/CsrMatrix pair for C. C inherits the compressed-row layout from A
   where applicable (rindex copied to host and device). An early exit to 'finalizesym' with
   nz = 0 is taken when any dimension or operand nonzero count is zero, because cusparse raises
   errors on empty matrices (see in-code comment). For a compressed-row B, a shallow CsrMatrix
   sharing B's values/column indices but using full row offsets (cached in Bcusp->rowoffsets_gpu)
   is built and kept in mmdata->Bcsr. Flops are precomputed on the host for AB/AtB (TODO for ABt).
   CUDA >= 11 path: two cusparseSpGEMM_workEstimation calls followed by two cusparseSpGEMM_compute
   calls; BOTH buffers (mmBuffer2 from estimation, mmBuffer from compute) must stay alive — see the
   in-code comment about the unclear cuSPARSE API — then cusparseSpMatGetSize yields nnz(C) and
   cusparseCsrSetPointers/cusparseSpGEMM_copy fill the CSR arrays. CUDA < 11 path:
   cusparseXcsrgemmNnz to size C, then a full cusparse_csr_spgemm (the legacy API cannot do a
   symbolic-only pass — see in-code comment about the buggy gemm2 alternative).
   'finalizesym' mirrors the device CSR structure to the host (with 32->64-bit index conversion on
   the GPU when PETSc uses 64-bit indices), expands compressed row offsets to full length, fills
   ilen/imax/rmax/nonzerorowcnt, and — when api_user is set and both operands were up to date on
   host and device — flags mmdata->reusesym so the numeric phase only performs assembly
   bookkeeping. */
MatDestroy_MatMatCusparse; 2380 2381 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2382 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2383 ptype = product->type; 2384 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2385 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2386 biscompressed = PETSC_FALSE; 2387 ciscompressed = PETSC_FALSE; 2388 switch (ptype) { 2389 case MATPRODUCT_AB: 2390 m = A->rmap->n; 2391 n = B->cmap->n; 2392 k = A->cmap->n; 2393 Amat = Acusp->mat; 2394 Bmat = Bcusp->mat; 2395 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2396 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2397 break; 2398 case MATPRODUCT_AtB: 2399 m = A->cmap->n; 2400 n = B->cmap->n; 2401 k = A->rmap->n; 2402 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 2403 Amat = Acusp->matTranspose; 2404 Bmat = Bcusp->mat; 2405 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2406 break; 2407 case MATPRODUCT_ABt: 2408 m = A->rmap->n; 2409 n = B->rmap->n; 2410 k = A->cmap->n; 2411 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 2412 Amat = Acusp->mat; 2413 Bmat = Bcusp->matTranspose; 2414 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2415 break; 2416 default: 2417 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2418 } 2419 2420 /* create cusparse matrix */ 2421 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2422 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2423 c = (Mat_SeqAIJ*)C->data; 2424 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2425 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2426 Ccsr = new CsrMatrix; 2427 2428 c->compressedrow.use = ciscompressed; 2429 if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */ 2430 c->compressedrow.nrows = a->compressedrow.nrows; 2431 ierr = 
PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2432 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2433 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2434 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2435 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2436 } else { 2437 c->compressedrow.nrows = 0; 2438 c->compressedrow.i = NULL; 2439 c->compressedrow.rindex = NULL; 2440 Ccusp->workVector = NULL; 2441 Cmat->cprowIndices = NULL; 2442 } 2443 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2444 Ccusp->mat = Cmat; 2445 Ccusp->mat->mat = Ccsr; 2446 Ccsr->num_rows = Ccusp->nrows; 2447 Ccsr->num_cols = n; 2448 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2449 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2450 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2451 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2452 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2453 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2454 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2455 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2456 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2457 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2458 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2459 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2460 c->nz = 0; 2461 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2462 Ccsr->values = new THRUSTARRAY(c->nz); 2463 goto finalizesym; 2464 } 2465 2466 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2467 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2468 Acsr = (CsrMatrix*)Amat->mat; 2469 if (!biscompressed) { 2470 Bcsr = (CsrMatrix*)Bmat->mat; 2471 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2472 BmatSpDescr = Bmat->matDescr; 2473 #endif 2474 } else { /* we need to use row offsets for the full matrix */ 2475 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2476 Bcsr = new CsrMatrix; 2477 Bcsr->num_rows = B->rmap->n; 2478 Bcsr->num_cols = cBcsr->num_cols; 2479 Bcsr->num_entries = cBcsr->num_entries; 2480 Bcsr->column_indices = cBcsr->column_indices; 2481 Bcsr->values = cBcsr->values; 2482 if (!Bcusp->rowoffsets_gpu) { 2483 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2484 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2485 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2486 } 2487 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2488 mmdata->Bcsr = Bcsr; 2489 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2490 if (Bcsr->num_rows && Bcsr->num_cols) { 2491 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2492 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2493 Bcsr->values->data().get(), 2494 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2495 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2496 } 2497 BmatSpDescr = mmdata->matSpBDescr; 2498 #endif 2499 } 2500 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2501 if (!Bcsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2502 /* precompute flops count */ 2503 if (ptype == MATPRODUCT_AB) { 2504 for (i=0, flops = 0; i<A->rmap->n; i++) { 2505 const PetscInt st = a->i[i]; 2506 const PetscInt en = a->i[i+1]; 2507 for (j=st; j<en; j++) { 2508 const PetscInt brow = a->j[j]; 2509 flops += 2.*(b->i[brow+1] - b->i[brow]); 2510 } 2511 } 2512 } else if (ptype == MATPRODUCT_AtB) { 2513 for (i=0, flops = 0; i<A->rmap->n; i++) { 2514 const PetscInt anzi = a->i[i+1] - a->i[i]; 2515 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2516 flops += (2.*anzi)*bnzi; 2517 } 2518 } else { /* TODO */ 2519 flops = 0.; 2520 } 2521 2522 mmdata->flops = flops; 2523 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2524 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2525 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2526 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2527 NULL, NULL, NULL, 2528 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2529 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2530 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2531 /* ask bufferSize bytes for external memory */ 2532 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2533 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2534 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2535 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2536 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2537 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2538 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2539 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2540 cusparse_scalartype, 
CUSPARSE_SPGEMM_DEFAULT, 2541 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2542 /* ask bufferSize again bytes for external memory */ 2543 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2544 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2545 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2546 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2547 /* The CUSPARSE documentation is not clear, nor the API 2548 We need both buffers to perform the operations properly! 2549 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2550 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2551 is stored in the descriptor! What a messy API... */ 2552 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2553 /* compute the intermediate product of A * B */ 2554 stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2555 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2556 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2557 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2558 /* get matrix C non-zero entries C_nnz1 */ 2559 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2560 c->nz = (PetscInt) C_nnz1; 2561 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2562 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2563 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2564 Ccsr->values = new THRUSTARRAY(c->nz); 2565 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2566 
stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2567 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2568 stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2569 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2570 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2571 #else 2572 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2573 stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2574 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2575 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2576 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2577 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2578 c->nz = cnz; 2579 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2580 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2581 Ccsr->values = new THRUSTARRAY(c->nz); 2582 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2583 2584 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2585 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2586 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2587 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2588 stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2589 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2590 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2591 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2592 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2593 #endif 2594 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2595 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2596 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2597 finalizesym: 2598 c->singlemalloc = PETSC_FALSE; 2599 c->free_a = PETSC_TRUE; 2600 c->free_ij = PETSC_TRUE; 2601 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2602 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2603 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2604 PetscInt *d_i = c->i; 2605 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2606 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2607 ii = *Ccsr->row_offsets; 2608 jj = *Ccsr->column_indices; 2609 if (ciscompressed) d_i = c->compressedrow.i; 2610 cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2611 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2612 } else { 2613 PetscInt *d_i = c->i; 2614 if (ciscompressed) d_i = c->compressedrow.i; 2615 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2616 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2617 } 2618 if (ciscompressed) { /* 
need to expand host row offsets */ 2619 PetscInt r = 0; 2620 c->i[0] = 0; 2621 for (k = 0; k < c->compressedrow.nrows; k++) { 2622 const PetscInt next = c->compressedrow.rindex[k]; 2623 const PetscInt old = c->compressedrow.i[k]; 2624 for (; r < next; r++) c->i[r+1] = old; 2625 } 2626 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2627 } 2628 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2629 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2630 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2631 c->maxnz = c->nz; 2632 c->nonzerorowcnt = 0; 2633 c->rmax = 0; 2634 for (k = 0; k < m; k++) { 2635 const PetscInt nn = c->i[k+1] - c->i[k]; 2636 c->ilen[k] = c->imax[k] = nn; 2637 c->nonzerorowcnt += (PetscInt)!!nn; 2638 c->rmax = PetscMax(c->rmax,nn); 2639 } 2640 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2641 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2642 Ccsr->num_entries = c->nz; 2643 2644 C->nonzerostate++; 2645 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2646 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2647 Ccusp->nonzerostate = C->nonzerostate; 2648 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2649 C->preallocated = PETSC_TRUE; 2650 C->assembled = PETSC_FALSE; 2651 C->was_assembled = PETSC_FALSE; 2652 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2653 mmdata->reusesym = PETSC_TRUE; 2654 C->offloadmask = PETSC_OFFLOAD_GPU; 2655 } 2656 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2657 PetscFunctionReturn(0); 2658 } 2659 2660 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2661 2662 /* handles sparse or dense B */ 2663 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2664 { 2665 Mat_Product *product = mat->product; 2666 
/* NOTE(review, documentation only — code below is unchanged): this span contains four pieces.
   (1) Body of MatProductSetFromOptions_SeqAIJCUSPARSE (opened just above) — dispatches the
       product implementation: a dense B routes to the CUDA sparse-dense symbolic (or to the CPU
       MatProductSetFromOptions_SeqAIJ_SeqDense when A is bound to the CPU); cusparse B and C
       route to the CUDA sparse-sparse symbolic for AB/AtB/ABt and to MatProductSymbolic_ABC_Basic
       for PtAP/RARt/ABC; everything else falls back to the CPU MatProductSetFromOptions_SeqAIJ.
       Note Biscusp/Ciscusp stay PETSC_FALSE/PETSC_TRUE unless explicitly re-tested, and the GPU
       path is only tested when neither A nor B is bound to the CPU.
   (2) The MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose(Add) wrappers, which all
       delegate to MatMultAddKernel_SeqAIJCUSPARSE with the (trans,herm) flag pair encoding the
       requested op(A) and a NULL yy for the non-add variants.
   (3) The ScatterAdd kernel: y[idx[i]] += x[i] for i < n, one thread per entry with a bounds
       check; fixed-size launch (no grid-stride loop), so the launcher elsewhere in this file is
       responsible for covering all n entries.
   (4) The opening doc comment of MatMultAddKernel_SeqAIJCUSPARSE (the function itself continues
       past this span). */
PetscErrorCode ierr; 2667 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2668 2669 PetscFunctionBegin; 2670 MatCheckProduct(mat,1); 2671 ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2672 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2673 ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2674 } 2675 if (product->type == MATPRODUCT_ABC) { 2676 Ciscusp = PETSC_FALSE; 2677 if (!product->C->boundtocpu) { 2678 ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2679 } 2680 } 2681 if (isdense) { 2682 switch (product->type) { 2683 case MATPRODUCT_AB: 2684 case MATPRODUCT_AtB: 2685 case MATPRODUCT_ABt: 2686 case MATPRODUCT_PtAP: 2687 case MATPRODUCT_RARt: 2688 if (product->A->boundtocpu) { 2689 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2690 } else { 2691 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2692 } 2693 break; 2694 case MATPRODUCT_ABC: 2695 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2696 break; 2697 default: 2698 break; 2699 } 2700 } else if (Biscusp && Ciscusp) { 2701 switch (product->type) { 2702 case MATPRODUCT_AB: 2703 case MATPRODUCT_AtB: 2704 case MATPRODUCT_ABt: 2705 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2706 break; 2707 case MATPRODUCT_PtAP: 2708 case MATPRODUCT_RARt: 2709 case MATPRODUCT_ABC: 2710 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2711 break; 2712 default: 2713 break; 2714 } 2715 } else { /* fallback for AIJ */ 2716 ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2717 } 2718 PetscFunctionReturn(0); 2719 } 2720 2721 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2722 { 2723 PetscErrorCode ierr; 2724 2725 PetscFunctionBegin; 2726 ierr = 
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2727 PetscFunctionReturn(0); 2728 } 2729 2730 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2731 { 2732 PetscErrorCode ierr; 2733 2734 PetscFunctionBegin; 2735 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2736 PetscFunctionReturn(0); 2737 } 2738 2739 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2740 { 2741 PetscErrorCode ierr; 2742 2743 PetscFunctionBegin; 2744 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2745 PetscFunctionReturn(0); 2746 } 2747 2748 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2749 { 2750 PetscErrorCode ierr; 2751 2752 PetscFunctionBegin; 2753 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2754 PetscFunctionReturn(0); 2755 } 2756 2757 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2758 { 2759 PetscErrorCode ierr; 2760 2761 PetscFunctionBegin; 2762 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2763 PetscFunctionReturn(0); 2764 } 2765 2766 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2767 { 2768 int i = blockIdx.x*blockDim.x + threadIdx.x; 2769 if (i < n) y[idx[i]] += x[i]; 2770 } 2771 2772 /* z = op(A) x + y. 
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y as seen by cusparseSpMV; only set on the CSR path */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: result reduces to z = 0 or z = y */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* use A with a cuSPARSE transpose/conjugate-transpose op rather than an explicit A^T */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* build (once) and use the explicitly stored transpose with a non-transpose op */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows; /* swapped relative to the non-transpose case */
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cached descriptor triple below, hence the range check */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      /* pre-CUDA-11 legacy csrmv API */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose case: SpMV already wrote the full-length result; only add y when it is a distinct vector */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) { /* a device-side split-CSR matrix exists, so the assembled data lives on the GPU */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.
   n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* calls the SeqAIJ preallocation routine directly; the matrix type is already seqaijcusparse */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees the GPU-side data (mult structs or triangular factors, plus any device split-CSR
   matrix), removes the composed methods, then delegates to the SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device split-CSR pointer before the spptr holding it is destroyed */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back so device-allocated members can be freed from the host */
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate via the SeqAIJ duplicate, then convert the copy in place back to seqaijcusparse */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Y = Y + a*X on the GPU. Falls back to the host MatAXPY when the operands differ in type,
   uses cusparse csrgeam when X's pattern is a subset of Y's, and a plain cublas axpy on the
   nonzero arrays when the patterns are identical. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical index arrays */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the scalars a and b are host pointers here, so switch the pointer mode around the call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11: csrgeam2 needs an explicitly sized external work buffer.
       Note the output aliases Y's arrays (in-place accumulation into ay). */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                          cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                               cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: Y's value array += a * X's value array */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* patterns differ: invalidate the cached transpose and fall back to the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Y = a*Y: scale the nonzero value array on the GPU with a single cublas scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cudaError_t    cerr;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zeros the values of both the GPU copies (mat and cached transpose, if present) and the
   host value array, then sets the offload mask to reflect which copies are valid. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set when the GPU copy was zeroed too */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) { /* keep the cached transpose consistent with the zeroed matrix */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}

/* Switches the Mat between CPU (flg=PETSC_TRUE) and GPU implementations by swapping the
   ops table entries and the composed _C methods. A no-op for factored matrices. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* bring the current values back to the host before routing operations to the CPU */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inodes only help the CPU kernels */
  PetscFunctionReturn(0);
}

PETSC_INTERN
PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* MAT_INPLACE_MATRIX: *newmat is A itself */

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: allocate the cuSPARSE context with the CSR defaults */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factors context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU implementations (bind-to-CPU false) and publish the new type name */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: create a SeqAIJ matrix, then convert it in place to seqaijcusparse */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Registers the CUSPARSE solver package factorizations with PETSc's solver-type registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Frees the Mat_SeqAIJCUSPARSE context: both mult structs, the thrust-owned work/index
   vectors, and the cusparse handle. Safe to call with an already-NULL pointer. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Frees a CsrMatrix (thrust device vectors plus the struct) and NULLs the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees a triangular-factor struct: descriptor, analysis info, CSR data, and buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Frees a mult struct in the given storage format (HYB matrices before CUDA 11, CSR otherwise),
   including the device-resident scalar constants and the cached SpMV descriptors/buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat =
cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3473 #endif 3474 } else { 3475 mat = (CsrMatrix*)(*matstruct)->mat; 3476 CsrMatrix_Destroy(&mat); 3477 } 3478 } 3479 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 3480 delete (*matstruct)->cprowIndices; 3481 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 3482 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 3483 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3484 3485 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3486 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3487 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3488 for (int i=0; i<3; i++) { 3489 if (mdata->cuSpMV[i].initialized) { 3490 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3491 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3492 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3493 } 3494 } 3495 #endif 3496 delete *matstruct; 3497 *matstruct = NULL; 3498 } 3499 PetscFunctionReturn(0); 3500 } 3501 3502 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3503 { 3504 PetscErrorCode ierr; 3505 3506 PetscFunctionBegin; 3507 if (*trifactors) { 3508 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3509 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3510 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3511 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 3512 delete (*trifactors)->rpermIndices; 3513 delete (*trifactors)->cpermIndices; 3514 delete (*trifactors)->workVector; 3515 (*trifactors)->rpermIndices = NULL; 3516 (*trifactors)->cpermIndices = 
NULL; 3517 (*trifactors)->workVector = NULL; 3518 if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);} 3519 if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);} 3520 } 3521 PetscFunctionReturn(0); 3522 } 3523 3524 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3525 { 3526 PetscErrorCode ierr; 3527 cusparseHandle_t handle; 3528 cusparseStatus_t stat; 3529 3530 PetscFunctionBegin; 3531 if (*trifactors) { 3532 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 3533 if (handle = (*trifactors)->handle) { 3534 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 3535 } 3536 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 3537 } 3538 PetscFunctionReturn(0); 3539 } 3540 3541 struct IJCompare 3542 { 3543 __host__ __device__ 3544 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3545 { 3546 if (t1.get<0>() < t2.get<0>()) return true; 3547 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3548 return false; 3549 } 3550 }; 3551 3552 struct IJEqual 3553 { 3554 __host__ __device__ 3555 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3556 { 3557 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3558 return true; 3559 } 3560 }; 3561 3562 struct IJDiff 3563 { 3564 __host__ __device__ 3565 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3566 { 3567 return t1 == t2 ? 
0 : 1; 3568 } 3569 }; 3570 3571 struct IJSum 3572 { 3573 __host__ __device__ 3574 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3575 { 3576 return t1||t2; 3577 } 3578 }; 3579 3580 #include <thrust/iterator/discard_iterator.h> 3581 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 3582 { 3583 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3585 THRUSTARRAY *cooPerm_v = NULL; 3586 thrust::device_ptr<const PetscScalar> d_v; 3587 CsrMatrix *matrix; 3588 PetscErrorCode ierr; 3589 cudaError_t cerr; 3590 PetscInt n; 3591 3592 PetscFunctionBegin; 3593 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3594 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3595 if (!cusp->cooPerm) { 3596 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3597 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3598 PetscFunctionReturn(0); 3599 } 3600 matrix = (CsrMatrix*)cusp->mat->mat; 3601 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3602 if (!v) { 3603 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3604 goto finalize; 3605 } 3606 n = cusp->cooPerm->size(); 3607 if (isCudaMem(v)) { 3608 d_v = thrust::device_pointer_cast(v); 3609 } else { 3610 cooPerm_v = new THRUSTARRAY(n); 3611 cooPerm_v->assign(v,v+n); 3612 d_v = cooPerm_v->data(); 3613 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 3614 } 3615 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3616 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3617 if (cusp->cooPerm_a) { 3618 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3619 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3620 
thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3621 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3622 delete cooPerm_w; 3623 } else { 3624 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3625 matrix->values->begin())); 3626 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3627 matrix->values->end())); 3628 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 3629 } 3630 } else { 3631 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3632 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3633 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3634 } else { 3635 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3636 matrix->values->begin())); 3637 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3638 matrix->values->end())); 3639 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3640 } 3641 } 3642 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3643 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3644 finalize: 3645 delete cooPerm_v; 3646 A->offloadmask = PETSC_OFFLOAD_GPU; 3647 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3648 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3649 ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3650 ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 
0\n");CHKERRQ(ierr); 3651 ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3652 a->reallocs = 0; 3653 A->info.mallocs += 0; 3654 A->info.nz_unneeded = 0; 3655 A->assembled = A->was_assembled = PETSC_TRUE; 3656 A->num_ass++; 3657 PetscFunctionReturn(0); 3658 } 3659 3660 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3661 { 3662 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3663 PetscErrorCode ierr; 3664 3665 PetscFunctionBegin; 3666 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3667 if (!cusp) PetscFunctionReturn(0); 3668 if (destroy) { 3669 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 3670 delete cusp->csr2csc_i; 3671 cusp->csr2csc_i = NULL; 3672 } 3673 A->transupdated = PETSC_FALSE; 3674 PetscFunctionReturn(0); 3675 } 3676 3677 #include <thrust/binary_search.h> 3678 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 3679 { 3680 PetscErrorCode ierr; 3681 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3682 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3683 PetscInt cooPerm_n, nzr = 0; 3684 cudaError_t cerr; 3685 3686 PetscFunctionBegin; 3687 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 3688 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 3689 cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 3690 if (n != cooPerm_n) { 3691 delete cusp->cooPerm; 3692 delete cusp->cooPerm_a; 3693 cusp->cooPerm = NULL; 3694 cusp->cooPerm_a = NULL; 3695 } 3696 if (n) { 3697 THRUSTINTARRAY d_i(n); 3698 THRUSTINTARRAY d_j(n); 3699 THRUSTINTARRAY ii(A->rmap->n); 3700 3701 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3702 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3703 3704 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 3705 d_i.assign(coo_i,coo_i+n); 3706 d_j.assign(coo_j,coo_j+n); 3707 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3708 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3709 3710 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3711 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3712 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 3713 *cusp->cooPerm_a = d_i; 3714 THRUSTINTARRAY w = d_j; 3715 3716 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 3717 if (nekey == ekey) { /* all entries are unique */ 3718 delete cusp->cooPerm_a; 3719 cusp->cooPerm_a = NULL; 3720 } else { /* I couldn't come up with a more elegant algorithm */ 3721 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 3722 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 3723 (*cusp->cooPerm_a)[0] = 0; 3724 w[0] = 0; 3725 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 3726 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 3727 } 3728 thrust::counting_iterator<PetscInt> search_begin(0); 3729 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 3730 search_begin, search_begin + A->rmap->n, 3731 ii.begin()); 3732 cerr = WaitForCUDA();CHKERRCUDA(cerr); 3733 ierr = 
PetscLogGpuTimeEnd();CHKERRQ(ierr); 3734 3735 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 3736 a->singlemalloc = PETSC_FALSE; 3737 a->free_a = PETSC_TRUE; 3738 a->free_ij = PETSC_TRUE; 3739 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 3740 a->i[0] = 0; 3741 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3742 a->nz = a->maxnz = a->i[A->rmap->n]; 3743 a->rmax = 0; 3744 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 3745 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 3746 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3747 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 3748 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 3749 for (PetscInt i = 0; i < A->rmap->n; i++) { 3750 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3751 nzr += (PetscInt)!!(nnzr); 3752 a->ilen[i] = a->imax[i] = nnzr; 3753 a->rmax = PetscMax(a->rmax,nnzr); 3754 } 3755 a->nonzerorowcnt = nzr; 3756 A->preallocated = PETSC_TRUE; 3757 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3758 ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 3759 } else { 3760 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 3761 } 3762 ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 3763 3764 /* We want to allocate the CUSPARSE struct for matvec now. 
3765 The code is so convoluted now that I prefer to copy zeros */ 3766 ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 3767 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 3768 A->offloadmask = PETSC_OFFLOAD_CPU; 3769 A->nonzerostate++; 3770 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3771 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 3772 3773 A->assembled = PETSC_FALSE; 3774 A->was_assembled = PETSC_FALSE; 3775 PetscFunctionReturn(0); 3776 } 3777 3778 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3779 { 3780 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3781 CsrMatrix *csr; 3782 PetscErrorCode ierr; 3783 3784 PetscFunctionBegin; 3785 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3786 PetscValidPointer(a,2); 3787 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3788 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3789 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3790 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3791 csr = (CsrMatrix*)cusp->mat->mat; 3792 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3793 *a = csr->values->data().get(); 3794 PetscFunctionReturn(0); 3795 } 3796 3797 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3798 { 3799 PetscFunctionBegin; 3800 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3801 PetscValidPointer(a,2); 3802 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3803 *a = NULL; 3804 PetscFunctionReturn(0); 3805 } 3806 3807 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 3808 { 3809 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3810 CsrMatrix *csr; 3811 PetscErrorCode ierr; 3812 3813 PetscFunctionBegin; 3814 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3815 PetscValidPointer(a,2); 3816 
PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3817 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3818 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3819 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3820 csr = (CsrMatrix*)cusp->mat->mat; 3821 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3822 *a = csr->values->data().get(); 3823 A->offloadmask = PETSC_OFFLOAD_GPU; 3824 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3825 PetscFunctionReturn(0); 3826 } 3827 3828 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 3829 { 3830 PetscErrorCode ierr; 3831 3832 PetscFunctionBegin; 3833 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3834 PetscValidPointer(a,2); 3835 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3836 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3837 *a = NULL; 3838 PetscFunctionReturn(0); 3839 } 3840 3841 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3842 { 3843 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3844 CsrMatrix *csr; 3845 PetscErrorCode ierr; 3846 3847 PetscFunctionBegin; 3848 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3849 PetscValidPointer(a,2); 3850 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3851 if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3852 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3853 csr = (CsrMatrix*)cusp->mat->mat; 3854 if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3855 *a = csr->values->data().get(); 3856 A->offloadmask = PETSC_OFFLOAD_GPU; 3857 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); 3858 PetscFunctionReturn(0); 3859 } 3860 3861 PetscErrorCode 
MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3862 { 3863 PetscErrorCode ierr; 3864 3865 PetscFunctionBegin; 3866 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3867 PetscValidPointer(a,2); 3868 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3869 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3870 *a = NULL; 3871 PetscFunctionReturn(0); 3872 } 3873 3874 struct IJCompare4 3875 { 3876 __host__ __device__ 3877 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 3878 { 3879 if (t1.get<0>() < t2.get<0>()) return true; 3880 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3881 return false; 3882 } 3883 }; 3884 3885 struct Shift 3886 { 3887 int _shift; 3888 3889 Shift(int shift) : _shift(shift) {} 3890 __host__ __device__ 3891 inline int operator() (const int &c) 3892 { 3893 return c + _shift; 3894 } 3895 }; 3896 3897 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3898 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3899 { 3900 PetscErrorCode ierr; 3901 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3902 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3903 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3904 CsrMatrix *Acsr,*Bcsr,*Ccsr; 3905 PetscInt Annz,Bnnz; 3906 cusparseStatus_t stat; 3907 PetscInt i,m,n,zero = 0; 3908 cudaError_t cerr; 3909 3910 PetscFunctionBegin; 3911 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3912 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3913 PetscValidPointer(C,4); 3914 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3915 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3916 if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 3917 if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3918 
if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3919 if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3920 if (reuse == MAT_INITIAL_MATRIX) { 3921 m = A->rmap->n; 3922 n = A->cmap->n + B->cmap->n; 3923 ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3924 ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3925 ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3926 c = (Mat_SeqAIJ*)(*C)->data; 3927 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3928 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3929 Ccsr = new CsrMatrix; 3930 Cmat->cprowIndices = NULL; 3931 c->compressedrow.use = PETSC_FALSE; 3932 c->compressedrow.nrows = 0; 3933 c->compressedrow.i = NULL; 3934 c->compressedrow.rindex = NULL; 3935 Ccusp->workVector = NULL; 3936 Ccusp->nrows = m; 3937 Ccusp->mat = Cmat; 3938 Ccusp->mat->mat = Ccsr; 3939 Ccsr->num_rows = m; 3940 Ccsr->num_cols = n; 3941 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3942 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3943 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3944 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3945 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3946 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3947 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3948 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3949 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3950 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3951 ierr = 
MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3952 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr); 3953 ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr); 3954 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3955 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3956 3957 Acsr = (CsrMatrix*)Acusp->mat->mat; 3958 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3959 Annz = (PetscInt)Acsr->column_indices->size(); 3960 Bnnz = (PetscInt)Bcsr->column_indices->size(); 3961 c->nz = Annz + Bnnz; 3962 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3963 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3964 Ccsr->values = new THRUSTARRAY(c->nz); 3965 Ccsr->num_entries = c->nz; 3966 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3967 if (c->nz) { 3968 auto Acoo = new THRUSTINTARRAY32(Annz); 3969 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 3970 auto Ccoo = new THRUSTINTARRAY32(c->nz); 3971 THRUSTINTARRAY32 *Aroff,*Broff; 3972 3973 if (a->compressedrow.use) { /* need full row offset */ 3974 if (!Acusp->rowoffsets_gpu) { 3975 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3976 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3977 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3978 } 3979 Aroff = Acusp->rowoffsets_gpu; 3980 } else Aroff = Acsr->row_offsets; 3981 if (b->compressedrow.use) { /* need full row offset */ 3982 if (!Bcusp->rowoffsets_gpu) { 3983 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3984 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3985 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3986 } 3987 Broff = Bcusp->rowoffsets_gpu; 3988 } else Broff = Bcsr->row_offsets; 3989 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3990 stat = cusparseXcsr2coo(Acusp->handle, 3991 Aroff->data().get(), 3992 Annz, 3993 m, 3994 Acoo->data().get(), 3995 
CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3996 stat = cusparseXcsr2coo(Bcusp->handle, 3997 Broff->data().get(), 3998 Bnnz, 3999 m, 4000 Bcoo->data().get(), 4001 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4002 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4003 auto Aperm = thrust::make_constant_iterator(1); 4004 auto Bperm = thrust::make_constant_iterator(0); 4005 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4006 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4007 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4008 #else 4009 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4010 auto Bcib = Bcsr->column_indices->begin(); 4011 auto Bcie = Bcsr->column_indices->end(); 4012 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4013 #endif 4014 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4015 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4016 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4017 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4018 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4019 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4020 auto p1 = Ccusp->cooPerm->begin(); 4021 auto p2 = Ccusp->cooPerm->begin(); 4022 thrust::advance(p2,Annz); 4023 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4024 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4025 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4026 #endif 4027 auto cci = thrust::make_counting_iterator(zero); 4028 auto cce = 
thrust::make_counting_iterator(c->nz); 4029 #if 0 //Errors on SUMMIT cuda 11.1.0 4030 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4031 #else 4032 auto pred = thrust::identity<int>(); 4033 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4034 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4035 #endif 4036 stat = cusparseXcoo2csr(Ccusp->handle, 4037 Ccoo->data().get(), 4038 c->nz, 4039 m, 4040 Ccsr->row_offsets->data().get(), 4041 CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4042 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4043 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4044 delete wPerm; 4045 delete Acoo; 4046 delete Bcoo; 4047 delete Ccoo; 4048 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4049 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4050 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4051 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4052 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4053 #endif 4054 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4055 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4056 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4057 CsrMatrix *CcsrT = new CsrMatrix; 4058 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4059 CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4060 4061 (*C)->form_explicit_transpose = PETSC_TRUE; 4062 (*C)->transupdated = PETSC_TRUE; 4063 Ccusp->rowoffsets_gpu = NULL; 4064 CmatT->cprowIndices = NULL; 4065 CmatT->mat = CcsrT; 4066 CcsrT->num_rows = n; 4067 CcsrT->num_cols = m; 4068 CcsrT->num_entries = c->nz; 4069 4070 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4071 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4072 CcsrT->values = new THRUSTARRAY(c->nz); 4073 4074 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4075 auto rT = CcsrT->row_offsets->begin(); 4076 if (AT) { 4077 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4078 thrust::advance(rT,-1); 4079 } 4080 if (BT) { 4081 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4082 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4083 thrust::copy(titb,tite,rT); 4084 } 4085 auto cT = CcsrT->column_indices->begin(); 4086 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4087 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4088 auto vT = CcsrT->values->begin(); 4089 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4090 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4091 cerr = WaitForCUDA();CHKERRCUDA(cerr); 4092 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4093 4094 stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 4095 stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4096 stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 4097 cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 4098 cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 4099 cerr = cudaMalloc((void **)&(CmatT->beta_one), 
sizeof(PetscScalar));CHKERRCUDA(cerr); 4100 cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4101 cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4102 cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 4103 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4104 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4105 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4106 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4107 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 4108 #endif 4109 Ccusp->matTranspose = CmatT; 4110 } 4111 } 4112 4113 c->singlemalloc = PETSC_FALSE; 4114 c->free_a = PETSC_TRUE; 4115 c->free_ij = PETSC_TRUE; 4116 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 4117 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 4118 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4119 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4120 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4121 ii = *Ccsr->row_offsets; 4122 jj = *Ccsr->column_indices; 4123 cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4124 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4125 } else { 4126 cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4127 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4128 } 4129 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + 
Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 4130 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 4131 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 4132 c->maxnz = c->nz; 4133 c->nonzerorowcnt = 0; 4134 c->rmax = 0; 4135 for (i = 0; i < m; i++) { 4136 const PetscInt nn = c->i[i+1] - c->i[i]; 4137 c->ilen[i] = c->imax[i] = nn; 4138 c->nonzerorowcnt += (PetscInt)!!nn; 4139 c->rmax = PetscMax(c->rmax,nn); 4140 } 4141 ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 4142 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 4143 (*C)->nonzerostate++; 4144 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 4145 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 4146 Ccusp->nonzerostate = (*C)->nonzerostate; 4147 (*C)->preallocated = PETSC_TRUE; 4148 } else { 4149 if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 4150 c = (Mat_SeqAIJ*)(*C)->data; 4151 if (c->nz) { 4152 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4153 if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4154 if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4155 if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4156 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 4157 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 4158 if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4159 if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4160 Acsr = (CsrMatrix*)Acusp->mat->mat; 4161 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4162 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4163 if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 4164 if (Bcsr->num_entries != 
(PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4165 if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4166 if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4167 if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4168 auto pmid = Ccusp->cooPerm->begin(); 4169 thrust::advance(pmid,Acsr->num_entries); 4170 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 4171 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4172 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4173 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4174 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4175 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4176 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4177 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4178 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4179 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4180 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4181 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr); 4182 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4183 if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4184 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* refresh the transpose values by concatenating A^T's block then B^T's block */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gather selected entries of A's nonzero value array on the GPU: v[k] = a[idx[k]].
   v may point to either host or device memory (detected via isCudaMem); idx is a
   host-side index array into the aij value array. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set, then gather through a permutation iterator */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY                     *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer for a host destination */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4234 } 4235 delete w; 4236 } else { 4237 cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 4238 } 4239 if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } 4240 ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr); 4241 PetscFunctionReturn(0); 4242 } 4243 4244 /* 4245 LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields) 4246 4247 requires: 4248 structurally symmetric: fix with transpose/column meta data 4249 */ 4250 4251 /* 4252 The GPU LU factor kernel 4253 */ 4254 __global__ 4255 void __launch_bounds__(1024,1) 4256 mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[]) 4257 { 4258 const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf; 4259 const PetscInt field = blockIdx.x, blkIdx = blockIdx.y; 4260 const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 4261 4262 // set i (row+1) 4263 if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero 4264 // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block 4265 for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y 4266 if (rowb < end_i && threadIdx.x==0) { 4267 PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? 
mi*(mi-1)/2 + mi : 0;
      bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
    }
  }
}

// copy AIJ to AIJ_BAND
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
                                const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
                                const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      PetscScalar    *batmp = ba_csr + bi_csr[rowb];
      const PetscInt nzb    = bi_csr[rowb+1] - bi_csr[rowb];
      for (int j=threadIdx.x; j<nzb; j += blockDim.x) {
        batmp[j] = 0;
      }
    }
  }

  // copy A into B with CSR format -- these two loops can be fused
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      /* r[] maps banded row -> AIJ row; ic[] maps AIJ column -> banded column */
      const PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
      const int      *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ?
rowb-bw : 0;
      const PetscScalar *av    = aa_d + ai_d[rowa];
      PetscScalar       *batmp = ba_csr + bi_csr[rowb];
      /* load in initial (unfactored row): scatter the AIJ row into its band slot */
      for (int j=threadIdx.x; j<nza; j += blockDim.x) {
        PetscInt    colb = ic[ajtmp[j]], idx = colb - bjStart;
        PetscScalar vala = av[j];
        batmp[idx] = vala;
      }
    }
  }
}

// print AIJ_BAND
__global__
void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug: a single thread dumps the whole band to stdout
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
    printf("B (AIJ) n=%d:\n",(int)n);
    for (int rowb=0;rowb<n;rowb++) {
      const PetscInt    nz     = bi_csr[rowb+1] - bi_csr[rowb];
      const PetscScalar *batmp = ba_csr + bi_csr[rowb];
      for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
      printf(" bi=%d\n",bi_csr[rowb+1]);
    }
  }
}

// Band LU kernel --- ba_csr bi_csr
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
{
  extern __shared__ PetscInt smemInt[];
  PetscInt       *sm_pkIdx = &smemInt[0];
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt start = field*nloc, end = start + nloc;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  auto g = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
  for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
    PetscInt       tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the inter ears
    const PetscInt nzUd  = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ?
bw : glbDD; // global to go past ears after first
    const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y));
    PetscScalar       *pBdd    = ba_csr + bi_csr[glbDD] + dOffset;
    const PetscScalar *baUd    = pBdd + 1; // vector of data U(i,i+1:end)
    const PetscScalar Bdd      = *pBdd;
    const PetscInt    offset   = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
    for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad; idx += inc, myi += inc) { /* assuming symmetric structure */
      if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
        /* scale the column of L below the pivot and stash kIdx for the x-threads */
        const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
        PetscScalar    *Aid = ba_csr + bi_csr[myi] + kIdx;
        *Aid = *Aid/Bdd;
        sm_pkIdx[threadIdx.y] = kIdx;
      }
      __syncthreads(); // synch on threadIdx.x only
      if (idx < nzUd) { /* assuming symmetric structure */
        PetscInt    kIdx = sm_pkIdx[threadIdx.y];
        PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
        PetscScalar *Aij = Aid + 1;
        PetscScalar Lid  = *Aid;
        for (int jIdx=threadIdx.x; jIdx<nzUd; jIdx += blockDim.x) {
          Aij[jIdx] -= Lid*baUd[jIdx];
        }
      }
    }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    g.sync();
#else
    __syncthreads();
#endif
  } /* endof for (i=0; i<n; i++) { */
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
/* Numeric banded LU factorization on the GPU (see MatLUFactorSymbolic_SeqAIJCUSPARSEBAND
   for the band layout that B->spptr holds). */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructA;
  CsrMatrix                    *matrixA;
  PetscErrorCode               ierr;
cudaError_t       cerr;
  const PetscInt    n=A->rmap->n, *ic, *r;
  const int         *ai_d, *aj_d;
  const PetscScalar *aa_d;
  PetscScalar       *ba_t = cusparseTriFactors->a_band_d;
  int               *bi_t = cusparseTriFactors->i_band_d;
  PetscContainer    container;
  int               Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;

  PetscFunctionBegin;
  if (A->rmap->n == 0) {
    PetscFunctionReturn(0);
  }
  // cusparse setup
  if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
  matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; // matstruct->cprowIndices
  if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matrixA = (CsrMatrix*)matstructA->mat;
  if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");

  // factor: get Nf if available (encoded as Nf + 1000*nconcurrent in the container)
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
    if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  // get data
  ic   = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
  ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d = thrust::raw_pointer_cast(matrixA->values->data().get());
  r    = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  {
    /* recover the bandwidth from nz = n + (2n-1)bw - bw^2 (see the symbolic phase) */
    int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
    int gpuid;
    cudaDeviceProp prop;
cudaGetDevice(&gpuid); 4434 cudaGetDeviceProperties(&prop, gpuid); 4435 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 4436 Ni = 1/nconcurrent; 4437 Ni = 1; 4438 #else 4439 nsm = prop.multiProcessorCount; 4440 Ni = nsm/Nf/nconcurrent; 4441 #endif 4442 team_size = bw/Ni + !!(bw%Ni); 4443 nVec = PetscMin(bw, 1024/team_size); 4444 ierr = PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);CHKERRQ(ierr); 4445 { 4446 dim3 dimBlockTeam(nVec,team_size); 4447 dim3 dimBlockLeague(Nf,Ni); 4448 mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t); 4449 CHECK_LAUNCH_ERROR(); // does a sync 4450 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4451 void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t}; 4452 cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL); 4453 #else 4454 mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t); 4455 #endif 4456 CHECK_LAUNCH_ERROR(); // does a sync 4457 #if defined(PETSC_USE_LOG) 4458 ierr = PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));CHKERRQ(ierr); 4459 #endif 4460 } 4461 } 4462 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4463 4464 /* determine which version of MatSolve needs to be used. 
from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
  B->ops->solve             = MatSolve_SeqAIJCUSPARSEBAND;
  B->ops->solvetranspose    = NULL; // need transpose
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  PetscFunctionReturn(0);
}

/* PetscContainer destructor for the "Nf" (number of fields) attribute */
static PetscErrorCode MatrixNfDestroy(void *ptr)
{
  PetscInt       *nf = (PetscInt *)ptr;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = PetscFree(nf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic phase of the banded LU: computes the bandwidth from the (reordered)
   structure of A, allocates the band storage on the GPU, and records the
   row/column permutations needed by the numeric phase and the solve. */
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data,*b;
  IS                           isicol;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  const PetscInt               *ic,*ai=a->i,*aj=a->j;
  PetscScalar                  *ba_t;
  int                          *bi_t;
  PetscInt                     i,n=A->rmap->n,Nf;
  PetscInt                     nzBcsr,bwL,bwU;
  PetscBool                    missing;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscContainer               container;

  PetscFunctionBegin;
  if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
  ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
  if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
  ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);CHKERRQ(ierr);
  if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structrally symmetric matrices supported");

  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
    /* propagate Nf to the factor matrix so the numeric phase and solve can see it */
    ierr = PetscContainerCreate(PETSC_COMM_SELF,
&container);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(PetscInt), &pNf);CHKERRQ(ierr);
    *pNf = Nf;
    ierr = PetscContainerSetPointer(container, (void *)pNf);CHKERRQ(ierr);
    ierr = PetscContainerSetUserDestroy(container, MatrixNfDestroy);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);CHKERRQ(ierr);
    ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);CHKERRQ(ierr);
  b    = (Mat_SeqAIJ*)(B)->data;

  /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
  bwL = bwU = 0;
  for (int rwb=0; rwb<n; rwb++) {
    const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
    for (int j=0;j<anz;j++) {
      PetscInt colb = ic[ajtmp[j]];
      if (colb<rwa) { // L
        if (rwa-colb > bwL) bwL = rwa-colb;
      } else {
        if (colb-rwa > bwU) bwU = colb-rwa;
      }
    }
  }
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  /* only support structurally symmetric, but it might work */
  if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  /* banded nonzero count: full band minus the clipped corners */
  nzBcsr   = n + (2*n-1)*bwU - bwU*bwU;
  b->maxnz = b->nz = nzBcsr;
  cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a
// place for flops (one extra trailing entry in ba_t)
  cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
  cusparseTriFactors->a_band_d = ba_t;
  cusparseTriFactors->i_band_d = bi_t;
  /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
  ierr = PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));CHKERRQ(ierr);
  {
    dim3 dimBlockTeam(1,128);
    dim3 dimBlockLeague(Nf,1);
    mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
  }
  CHECK_LAUNCH_ERROR(); // does a sync

  // setup data
  if (!cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  /* upper triangular indices */
  if (!cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(isicol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(isicol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* put together the new matrix */
  b->free_a       = PETSC_FALSE;
  b->free_ij      = PETSC_FALSE;
  b->singlemalloc = PETSC_FALSE;
  b->ilen = NULL;
  b->imax = NULL;
  b->row  = isrow;
  b->col  = iscol;
  ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
  ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
  b->icol = isicol;
  ierr = PetscMalloc1(n+1,&b->solve_work);CHKERRQ(ierr);

  B->factortype            = MAT_FACTOR_LU;
  B->info.factor_mallocs   = 0;
  B->info.fill_ratio_given = 0;

  if (ai[n]) {
    B->info.fill_ratio_needed =
((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    ierr = PetscInfo1(A,"Band fill ratio %g\n",(double)af);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
  }
#endif
  if (a->inode.size) {
    ierr = PetscInfo(A,"Warning: using inodes in band solver.\n");CHKERRQ(ierr);
  }
  ierr = MatSeqAIJCheckInode_FactorLU(B);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
  B->offloadmask          = PETSC_OFFLOAD_GPU;

  PetscFunctionReturn(0);
}

/* Use -pc_factor_mat_solver_type cusparseband */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSEBAND;
  PetscFunctionReturn(0);
}

/* Factory for the CUSPARSEBAND LU factor matrix (only MAT_FACTOR_LU is supported) */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);CHKERRQ(ierr);
4651 PetscFunctionReturn(0); 4652 } 4653 4654 #define WARP_SIZE 32 4655 template <typename T> 4656 __forceinline__ __device__ 4657 T wreduce(T a) 4658 { 4659 T b; 4660 #pragma unroll 4661 for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) { 4662 b = __shfl_down_sync(0xffffffff, a, i); 4663 a += b; 4664 } 4665 return a; 4666 } 4667 // reduce in a block, returns result in thread 0 4668 template <typename T, int BLOCK_SIZE> 4669 __device__ 4670 T breduce(T a) 4671 { 4672 constexpr int NWARP = BLOCK_SIZE/WARP_SIZE; 4673 __shared__ double buf[NWARP]; 4674 int wid = threadIdx.x / WARP_SIZE; 4675 int laneid = threadIdx.x % WARP_SIZE; 4676 T b = wreduce<T>(a); 4677 if (laneid == 0) 4678 buf[wid] = b; 4679 __syncthreads(); 4680 if (wid == 0) { 4681 if (threadIdx.x < NWARP) 4682 a = buf[threadIdx.x]; 4683 else 4684 a = 0; 4685 for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) { 4686 a += __shfl_down_sync(0xffffffff, a, i); 4687 } 4688 } 4689 return a; 4690 } 4691 4692 4693 // Band LU kernel --- ba_csr bi_csr 4694 template <int BLOCK_SIZE> 4695 __global__ 4696 void __launch_bounds__(256,1) 4697 mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[]) 4698 { 4699 const PetscInt Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz; 4700 const PetscScalar *pLi; 4701 const int tid = threadIdx.x; 4702 4703 /* Next, solve L */ 4704 pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field 4705 for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) { 4706 const PetscInt col = locDD<bw ? 
start : (glbDD-bw);
    PetscScalar t = 0;
    /* partial dot product of row glbDD of L with x, strided across the block */
    for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
      t += pLi[idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal   tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(t);
#endif
    if (threadIdx.x == 0)
      x[glbDD] -= t; // /1.0 (unit diagonal in L)
    __syncthreads();
    // inc
    pLi += glbDD-col; // get to diagonal
    if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
    else                pLi += bw;
    pLi += 1; // skip to next row
    if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
  }
  /* Then, solve U */
  pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
  if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
  for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
    const PetscInt col = (locDD<bw) ?
end-1 : glbDD+bw; // end of row in U
    PetscScalar t = 0;
    /* partial dot product of row glbDD of U (walking backwards) with x */
    for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
      t += pLi[-idx]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal   tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
#endif
    pLi -= col-glbDD; // diagonal
    if (threadIdx.x == 0) {
      x[glbDD] -= t;
      x[glbDD] /= pLi[0];
    }
    __syncthreads();
    // inc past L to start of previous U
    pLi -= bw+1;
    if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
    if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
  }
}

/* Solve A x = b with the banded LU factors held in A->spptr (see
   MatLUFactorNumeric_SeqAIJCUSPARSEBAND); applies the row permutation, runs the
   band solve kernel, then applies the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscContainer                        container;

  PetscFunctionBegin;
  if (A->rmap->n == 0) {
    PetscFunctionReturn(0);
  }
  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
  } else Nf = 1;
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  /* Get the GPU pointers */
ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());
  constexpr int block = 128;
  mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
  CHECK_LAUNCH_ERROR(); // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}